diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index d7ab9a4aae..544a5cb188 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -194,7 +194,9 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                      max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -220,16 +222,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 0f6ca9d1cb..582b43a975 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -70,7 +70,9 @@ typedef struct {
   CUmodule    moduleAtPoints;
   CeedInt     num_points;
   CUfunction  InterpAtPoints;
+  CUfunction  InterpTransposeAtPoints;
   CUfunction  GradAtPoints;
+  CUfunction  GradTransposeAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index bd2467e538..67f3d7ac7c 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -295,7 +295,9 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
                                      "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                      max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -321,16 +323,17 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index d70d75ab94..f42f2b1cff 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -22,7 +22,9 @@ typedef struct {
   CUmodule    moduleAtPoints;
   CeedInt     num_points;
   CUfunction  InterpAtPoints;
+  CUfunction  InterpTransposeAtPoints;
   CUfunction  GradAtPoints;
+  CUfunction  GradTransposeAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_collo_grad_1d;
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 8fb3d3fa20..7fdfb9f16d 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -192,7 +192,9 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -218,16 +220,17 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 5a695761a9..9740700d87 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -74,7 +74,9 @@ typedef struct {
   hipModule_t   moduleAtPoints;
   CeedInt       num_points;
   hipFunction_t InterpAtPoints;
+  hipFunction_t InterpTransposeAtPoints;
   hipFunction_t GradAtPoints;
+  hipFunction_t GradTransposeAtPoints;
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_q_weight_1d;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 6d8c858632..b08d1fa271 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -354,7 +354,9 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
   }
 
   // Get read/write access to u, v
@@ -380,16 +382,17 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
+      CeedCallBackend(
+          CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
-      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
-      CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT:
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index c000b7f873..962c088bc0 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -22,7 +22,9 @@ typedef struct {
   hipModule_t   moduleAtPoints;
   CeedInt       num_points;
   hipFunction_t InterpAtPoints;
+  hipFunction_t InterpTransposeAtPoints;
   hipFunction_t GradAtPoints;
+  hipFunction_t GradTransposeAtPoints;
   CeedInt       block_sizes[3];  // interp, grad, weight thread block sizes
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 2d17b55b2c..134547ecce 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
@@ -57,124 +57,239 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = 1;
-        CeedInt           post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
-
-        // Map from point
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          pre  = 1;
-          post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
-            // Update buffers used
-            pre /= 1;
-            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
-
-            // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
-
-            // Contract along middle index
-            for (CeedInt a = 0; a < pre; a++) {
-              for (CeedInt c = 0; c < post; c++) {
-                if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                } else {
-                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                }
-              }
-            }
-            post *= Q;
-          }
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
         }
+        post *= Q;
+      }
 
-        // Map from coefficients
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
         pre  = BASIS_NUM_QPTS;
         post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
           pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
+          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              CeedScalar v_k = 0;
+
+              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+              out[a * post + c] = v_k;
+            }
           }
-          post *= P;
+          post *= 1;
         }
       }
     }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = 1;
+      CeedInt           post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        pre  = 1;
+        post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
+          pre /= 1;
+          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              if (d == BASIS_DIM - 1) {
+                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+              } else {
+                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+              }
+            }
           }
           post *= Q;
         }
+      }
 
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = 0;
+  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
+
           pre  = BASIS_NUM_QPTS;
           post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
             // Update buffers used
             pre /= Q;
-            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
 
             // Contract along middle index
             for (CeedInt a = 0; a < pre; a++) {
@@ -193,12 +308,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   }
 }
 
-//------------------------------------------------------------------------------
-// Grad
-//------------------------------------------------------------------------------
-extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
-                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -213,147 +325,83 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt v_dim_stride  = 0;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt     pre   = 1;
-        CeedInt     post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt     pre   = 1;
+      CeedInt     post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
 
-        // Map from point
-        __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
-
-            pre  = 1;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= 1;
-              const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                  } else {
-                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                  }
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+
+          pre  = 1;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (dim_2 == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
               }
-              post *= Q;
             }
+            post *= Q;
           }
         }
-
-        // Map from coefficients
-        pre  = BASIS_NUM_QPTS;
-        post = 1;
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
-
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
-          }
-          post *= P;
-        }
       }
-    }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
 
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
-          }
-          post *= Q;
-        }
-
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
-
-            pre  = BASIS_NUM_QPTS;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= Q;
-              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  CeedScalar v_k = 0;
-
-                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
-                  out[a * post + c] = v_k;
-                }
-              }
-              post *= 1;
-            }
-          }
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
+        post *= P;
       }
     }
   }
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 9ce63a38de..188846386b 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
@@ -57,124 +57,239 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = 1;
-        CeedInt           post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
-
-        // Map from point
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          pre  = 1;
-          post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
-            // Update buffers used
-            pre /= 1;
-            const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
-
-            // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
-
-            // Contract along middle index
-            for (CeedInt a = 0; a < pre; a++) {
-              for (CeedInt c = 0; c < post; c++) {
-                if (d == BASIS_DIM - 1) {
-                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                } else {
-                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                }
-              }
-            }
-            post *= Q;
-          }
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
         }
+        post *= Q;
+      }
 
-        // Map from coefficients
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
         pre  = BASIS_NUM_QPTS;
         post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
           pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
+          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              CeedScalar v_k = 0;
+
+              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+              out[a * post + c] = v_k;
+            }
           }
-          post *= P;
+          post *= 1;
         }
       }
     }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = 1;
+      CeedInt           post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        pre  = 1;
+        post = 1;
         for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
           // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
+          pre /= 1;
+          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
 
           // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              if (d == BASIS_DIM - 1) {
+                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+              } else {
+                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+              }
+            }
           }
           post *= Q;
         }
+      }
 
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = 0;
+  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
+
           pre  = BASIS_NUM_QPTS;
           post = 1;
-          for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
             // Update buffers used
             pre /= Q;
-            const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
 
             // Build Chebyshev polynomial values
-            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
 
             // Contract along middle index
             for (CeedInt a = 0; a < pre; a++) {
@@ -193,12 +308,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   }
 }
 
-//------------------------------------------------------------------------------
-// Grad
-//------------------------------------------------------------------------------
-extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
-                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -213,147 +325,83 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
 
   const CeedInt P             = BASIS_P_1D;
   const CeedInt Q             = BASIS_Q_1D;
-  const CeedInt u_stride      = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
-  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
-  const CeedInt u_size        = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
-  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt v_dim_stride  = 0;
 
   // Apply basis element by element
-  if (is_transpose) {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
-        CeedInt     pre   = 1;
-        CeedInt     post  = 1;
-
-        // Clear Chebyshev coeffs
-        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
-          s_chebyshev_coeffs[k] = 0.0;
-        }
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt     pre   = 1;
+      CeedInt     post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
 
-        // Map from point
-        __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          if (p >= points_per_elem[elem]) continue;
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
-
-            pre  = 1;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= 1;
-              const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  if (dim_2 == BASIS_DIM - 1) {
-                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
-                  } else {
-                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
-                  }
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+
+          pre  = 1;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (dim_2 == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                 }
               }
-              post *= Q;
             }
+            post *= Q;
           }
         }
-
-        // Map from coefficients
-        pre  = BASIS_NUM_QPTS;
-        post = 1;
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= Q;
-          const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * P;
-
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % P;
-            const CeedInt a   = k / (post * P);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
-            if (d == BASIS_DIM - 1) out[k] += v_k;
-            else out[k] = v_k;
-          }
-          post *= P;
-        }
       }
-    }
-  } else {
-    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
-      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
-        CeedInt           pre   = u_size;
-        CeedInt           post  = 1;
-
-        // Map to coefficients
-        for (CeedInt d = 0; d < BASIS_DIM; d++) {
-          __syncthreads();
-          // Update buffers used
-          pre /= P;
-          const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
-          CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
-          const CeedInt     writeLen = pre * post * Q;
 
-          // Contract along middle index
-          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-            const CeedInt c   = k % post;
-            const CeedInt j   = (k / post) % Q;
-            const CeedInt a   = k / (post * Q);
-            CeedScalar    v_k = 0;
-
-            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
-            out[k] = v_k;
-          }
-          post *= Q;
-        }
-
-        // Map to point
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
-        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
-          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
-
-            pre  = BASIS_NUM_QPTS;
-            post = 1;
-            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
-              // Update buffers used
-              pre /= Q;
-              const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
-              CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
-
-              // Build Chebyshev polynomial values
-              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
-
-              // Contract along middle index
-              for (CeedInt a = 0; a < pre; a++) {
-                for (CeedInt c = 0; c < post; c++) {
-                  CeedScalar v_k = 0;
-
-                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
-                  out[a * post + c] = v_k;
-                }
-              }
-              post *= 1;
-            }
-          }
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
+        post *= P;
       }
     }
   }