diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index d7ab9a4aae..544a5cb188 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -194,7 +194,9 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); } // Get read/write access to u, v @@ -220,16 +222,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend( + CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 0f6ca9d1cb..582b43a975 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -70,7 +70,9 @@ typedef struct { CUmodule moduleAtPoints; CeedInt num_points; CUfunction InterpAtPoints; + CUfunction InterpTransposeAtPoints; CUfunction GradAtPoints; + CUfunction GradTransposeAtPoints; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index bd2467e538..67f3d7ac7c 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -295,7 +295,9 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); } // Get read/write access to u, v @@ -321,16 +323,17 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend( + CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index d70d75ab94..f42f2b1cff 100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -22,7 +22,9 @@ typedef struct { CUmodule moduleAtPoints; CeedInt num_points; CUfunction InterpAtPoints; + CUfunction InterpTransposeAtPoints; CUfunction GradAtPoints; + CUfunction GradTransposeAtPoints; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_collo_grad_1d; diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index 8fb3d3fa20..7fdfb9f16d 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -192,7 +192,9 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); } // Get read/write access to u, v @@ -218,16 +220,17 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend( + CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 5a695761a9..9740700d87 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -74,7 +74,9 @@ typedef struct { hipModule_t moduleAtPoints; CeedInt num_points; hipFunction_t InterpAtPoints; + hipFunction_t InterpTransposeAtPoints; hipFunction_t GradAtPoints; + hipFunction_t GradTransposeAtPoints; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 6d8c858632..b08d1fa271 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -354,7 +354,9 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); } // Get read/write access to u, v @@ -380,16 +382,17 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend( + CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index c000b7f873..962c088bc0 100644 --- a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -22,7 +22,9 @@ typedef struct { hipModule_t moduleAtPoints; CeedInt num_points; hipFunction_t InterpAtPoints; + hipFunction_t InterpTransposeAtPoints; hipFunction_t GradAtPoints; + hipFunction_t GradTransposeAtPoints; CeedInt block_sizes[3]; // interp, grad, weight thread block sizes CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h index 2d17b55b2c..134547ecce 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h @@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; @@ -57,124 +57,239 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt const CeedInt P = BASIS_P_1D; const CeedInt Q = BASIS_Q_1D; - const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; - const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; - const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); - const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; // Apply basis element by element - if (is_transpose) { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; - CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; - CeedInt pre = 1; - CeedInt post = 1; - - // Clear Chebyshev coeffs - for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { - s_chebyshev_coeffs[k] = 0.0; - } - - // Map from point + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { - if (p >= points_per_elem[elem]) continue; - pre = 1; - post = 1; - for (CeedInt d = 0; d < BASIS_DIM; d++) { - // Update buffers used - pre /= 1; - const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); - - // Build Chebyshev polynomial values - ChebyshevPolynomialsAtPoint(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); - - // Contract along middle index - for (CeedInt a = 0; a < pre; a++) { - for (CeedInt c = 0; c < post; c++) { - if (d == BASIS_DIM - 1) { - for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); - } else { - for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; - } - } - } - post *= Q; - } + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; } + post *= Q; + } - // Map from coefficients + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { pre = BASIS_NUM_QPTS; post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); // Update buffers used pre /= Q; - const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * P; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % P; - const CeedInt a = k / (post * P); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - if (d == BASIS_DIM - 1) out[k] += v_k; - else out[k] = v_k; + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; + out[a * post + c] = v_k; + } } - post *= P; + post *= 1; } } } - } else { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; - CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; - CeedInt pre = u_size; - CeedInt post = 1; - - // Map to coefficients + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } + + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + pre = 1; + post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); // Update buffers used - pre /= P; - const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * Q; + pre /= 1; + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; - out[k] = v_k; + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (d == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; + } + } } post *= Q; } + } - // Map to point + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; + } + post *= P; + } + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; + const CeedInt u_dim_stride = 0; + const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; + } + post *= Q; + } + + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; + pre = BASIS_NUM_QPTS; post = 1; - for (CeedInt d = 0; d < BASIS_DIM; d++) { + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { // Update buffers used pre /= Q; - const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); + const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values - ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); // Contract along middle index for (CeedInt a = 0; a < pre; a++) { @@ -193,12 +308,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt } } -//------------------------------------------------------------------------------ -// Grad -//------------------------------------------------------------------------------ -extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; @@ -213,147 +325,83 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is const CeedInt P = BASIS_P_1D; const CeedInt Q = BASIS_Q_1D; - const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; - const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; - const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); - const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; - const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; - const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + const CeedInt v_dim_stride = 0; // Apply basis element by element - if (is_transpose) { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; - CeedInt pre = 1; - CeedInt post = 1; - - // Clear Chebyshev coeffs - for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { - s_chebyshev_coeffs[k] = 0.0; - } + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } - // Map from point - __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { - if (p >= points_per_elem[elem]) continue; - for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; - - pre = 1; - post = 1; - for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { - // Update buffers used - pre /= 1; - const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); - - // Build Chebyshev polynomial values - if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); - else ChebyshevPolynomialsAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); - - // Contract along middle index - for (CeedInt a = 0; a < pre; a++) { - for (CeedInt c = 0; c < post; c++) { - if (dim_2 == BASIS_DIM - 1) { - for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); - } else { - for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; - } + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + + pre = 1; + post = 1; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { + // Update buffers used + pre /= 1; + const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (dim_2 == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; } } - post *= Q; } + post *= Q; } } - - // Map from coefficients - pre = BASIS_NUM_QPTS; - post = 1; - for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); - // Update buffers used - pre /= Q; - const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * P; - - // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % P; - const CeedInt a = k / (post * P); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - if (d == BASIS_DIM - 1) out[k] += v_k; - else out[k] = v_k; - } - post *= P; - } } - } - } else { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; - CeedInt pre = u_size; - CeedInt post = 1; - - // Map to coefficients - for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); - // Update buffers used - pre /= P; - const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * Q; - // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; - out[k] = v_k; - } - post *= Q; - } - - // Map to point + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { - for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; - - pre = BASIS_NUM_QPTS; - post = 1; - for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { - // Update buffers used - pre /= Q; - const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); - - // Build Chebyshev polynomial values - if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); - else ChebyshevPolynomialsAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); - - // Contract along middle index - for (CeedInt a = 0; a < pre; a++) { - for (CeedInt c = 0; c < post; c++) { - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; - out[a * post + c] = v_k; - } - } - post *= 1; - } - } + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } + post *= P; } } } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h index 9ce63a38de..188846386b 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h @@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; @@ -57,124 +57,239 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt const CeedInt P = BASIS_P_1D; const CeedInt Q = BASIS_Q_1D; - const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; - const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; - const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); - const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; // Apply basis element by element - if (is_transpose) { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; - CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; - CeedInt pre = 1; - CeedInt post = 1; - - // Clear Chebyshev coeffs - for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { - s_chebyshev_coeffs[k] = 0.0; - } - - // Map from point + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { - if (p >= points_per_elem[elem]) continue; - pre = 1; - post = 1; - for (CeedInt d = 0; d < BASIS_DIM; d++) { - // Update buffers used - pre /= 1; - const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); - - // Build Chebyshev polynomial values - ChebyshevPolynomialsAtPoint(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); - - // Contract along middle index - for (CeedInt a = 0; a < pre; a++) { - for (CeedInt c = 0; c < post; c++) { - if (d == BASIS_DIM - 1) { - for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); - } else { - for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; - } - } - } - post *= Q; - } + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; } + post *= Q; + } - // Map from coefficients + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { pre = BASIS_NUM_QPTS; post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); // Update buffers used pre /= Q; - const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * P; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % P; - const CeedInt a = k / (post * P); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - if (d == BASIS_DIM - 1) out[k] += v_k; - else out[k] = v_k; + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; + out[a * post + c] = v_k; + } } - post *= P; + post *= 1; } } } - } else { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; - CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; - CeedInt pre = u_size; - CeedInt post = 1; - - // Map to coefficients + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } + + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + pre = 1; + post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); // Update buffers used - pre /= P; - const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * Q; + pre /= 1; + const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + ChebyshevPolynomialsAtPoint(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; - out[k] = v_k; + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (d == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; + } + } } post *= Q; } + } - // Map to point + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; + } + post *= P; + } + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt i = threadIdx.x; + + __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; + CeedScalar *s_chebyshev_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; + CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; + for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { + s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; + } + + const CeedInt P = BASIS_P_1D; + const CeedInt Q = BASIS_Q_1D; + const CeedInt u_stride = BASIS_NUM_NODES; + const CeedInt v_stride = BASIS_NUM_PTS; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt u_size = BASIS_NUM_NODES; + const CeedInt u_dim_stride = 0; + const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; + CeedInt pre = u_size; + CeedInt post = 1; + + // Map to coefficients + for (CeedInt d = 0; d < BASIS_DIM; d++) { + __syncthreads(); + // Update buffers used + pre /= P; + const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * Q; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; + out[k] = v_k; + } + post *= Q; + } + + // Map to point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; + pre = BASIS_NUM_QPTS; post = 1; - for (CeedInt d = 0; d < BASIS_DIM; d++) { + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { // Update buffers used pre /= Q; - const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); + const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2); // Build Chebyshev polynomial values - ChebyshevPolynomialsAtPoint(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); // Contract along middle index for (CeedInt a = 0; a < pre; a++) { @@ -193,12 +308,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt } } -//------------------------------------------------------------------------------ -// Grad -//------------------------------------------------------------------------------ -extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; @@ -213,147 +325,83 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is const CeedInt P = BASIS_P_1D; const CeedInt Q = BASIS_Q_1D; - const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; - const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; - const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); - const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); - const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; - const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; - const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + const CeedInt u_stride = BASIS_NUM_PTS; + const CeedInt v_stride = BASIS_NUM_NODES; + const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; + const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; + const CeedInt u_size = BASIS_NUM_PTS; + const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; + const CeedInt v_dim_stride = 0; // Apply basis element by element - if (is_transpose) { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; - CeedInt pre = 1; - CeedInt post = 1; - - // Clear Chebyshev coeffs - for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { - s_chebyshev_coeffs[k] = 0.0; - } + for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; + CeedInt pre = 1; + CeedInt post = 1; + + // Clear Chebyshev coeffs + for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { + s_chebyshev_coeffs[k] = 0.0; + } - // Map from point - __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { - if (p >= points_per_elem[elem]) continue; - for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; - - pre = 1; - post = 1; - for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { - // Update buffers used - pre /= 1; - const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); - - // Build Chebyshev polynomial values - if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); - else ChebyshevPolynomialsAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); - - // Contract along middle index - for (CeedInt a = 0; a < pre; a++) { - for (CeedInt c = 0; c < post; c++) { - if (dim_2 == BASIS_DIM - 1) { - for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); - } else { - for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; - } + // Map from point + __syncthreads(); + for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; + for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { + const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; + + pre = 1; + post = 1; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { + // Update buffers used + pre /= 1; + const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); + + // Build Chebyshev polynomial values + if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + else ChebyshevPolynomialsAtPoint(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); + + // Contract along middle index + for (CeedInt a = 0; a < pre; a++) { + for (CeedInt c = 0; c < post; c++) { + if (dim_2 == BASIS_DIM - 1) { + for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); + } else { + for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; } } - post *= Q; } + post *= Q; } } - - // Map from coefficients - pre = BASIS_NUM_QPTS; - post = 1; - for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); - // Update buffers used - pre /= Q; - const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * P; - - // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % P; - const CeedInt a = k / (post * P); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - if (d == BASIS_DIM - 1) out[k] += v_k; - else out[k] = v_k; - } - post *= P; - } } - } - } else { - for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { - for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; - CeedInt pre = u_size; - CeedInt post = 1; - - // Map to coefficients - for (CeedInt d = 0; d < BASIS_DIM; d++) { - __syncthreads(); - // Update buffers used - pre /= P; - const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); - const CeedInt writeLen = pre * post * Q; - // Contract along middle index - for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; - out[k] = v_k; - } - post *= Q; - } - - // Map to point + // Map from coefficients + pre = BASIS_NUM_QPTS; + post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); - for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { - for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; - - pre = BASIS_NUM_QPTS; - post = 1; - for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { - // Update buffers used - pre /= Q; - const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2); - - // Build Chebyshev polynomial values - if (dim_1 == dim_2) ChebyshevDerivativeAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); - else ChebyshevPolynomialsAtPoint(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); - - // Contract along middle index - for (CeedInt a = 0; a < pre; a++) { - for (CeedInt c = 0; c < post; c++) { - CeedScalar v_k = 0; - - for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; - out[a * post + c] = v_k; - } - } - post *= 1; - } - } + // Update buffers used + pre /= Q; + const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedInt writeLen = pre * post * P; + + // Contract along middle index + for (CeedInt k = i; k < writeLen; k += blockDim.x) { + const CeedInt c = k % post; + const CeedInt j = (k / post) % P; + const CeedInt a = k / (post * P); + CeedScalar v_k = 0; + + for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } + post *= P; } } }