From 00af22e9d867e7153717b213af2d282e02116ebf Mon Sep 17 00:00:00 2001
From: krickw <william.krick@autodesk.com>
Date: Thu, 3 Dec 2020 14:50:47 -0500
Subject: [PATCH 1/5] Prototype support for GPU Normals and OSD using glsl. The
 code is disabled by default.

---
 cmake/modules/FindMaya.cmake                  |    3 +-
 lib/mayaUsd/render/CMakeLists.txt             |    1 +
 .../render/vp2ComputeShaders/CMakeLists.txt   |   13 +
 .../vp2ComputeShaders/computeNormals.cl       |   34 +
 .../vp2ComputeShaders/computeNormals.glsl     |   69 +
 .../render/vp2ComputeShaders/plugInfo.json    |   11 +
 .../render/vp2RenderDelegate/CMakeLists.txt   |    1 +
 lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp |  410 +++--
 lib/mayaUsd/render/vp2RenderDelegate/mesh.h   |   35 +-
 .../vp2RenderDelegate/meshViewportCompute.cpp | 1335 +++++++++++++++++
 .../vp2RenderDelegate/meshViewportCompute.h   |  261 ++++
 11 files changed, 2075 insertions(+), 98 deletions(-)
 create mode 100644 lib/mayaUsd/render/vp2ComputeShaders/CMakeLists.txt
 create mode 100644 lib/mayaUsd/render/vp2ComputeShaders/computeNormals.cl
 create mode 100644 lib/mayaUsd/render/vp2ComputeShaders/computeNormals.glsl
 create mode 100644 lib/mayaUsd/render/vp2ComputeShaders/plugInfo.json
 create mode 100644 lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp
 create mode 100644 lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h

diff --git a/cmake/modules/FindMaya.cmake b/cmake/modules/FindMaya.cmake
index 7d8f9ad29a..ed825217a4 100644
--- a/cmake/modules/FindMaya.cmake
+++ b/cmake/modules/FindMaya.cmake
@@ -207,7 +207,8 @@ foreach(MAYA_LIB
     IMFbase
     tbb
     cg
-    cgGL)
+    cgGL
+    clew)
 
     find_library(MAYA_${MAYA_LIB}_LIBRARY
             ${MAYA_LIB}
diff --git a/lib/mayaUsd/render/CMakeLists.txt b/lib/mayaUsd/render/CMakeLists.txt
index 4c6902a784..08d0cf135e 100644
--- a/lib/mayaUsd/render/CMakeLists.txt
+++ b/lib/mayaUsd/render/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(px_vp20)
 add_subdirectory(pxrUsdMayaGL)
+add_subDirectory(vp2ComputeShaders)
 add_subdirectory(vp2RenderDelegate)
 add_subdirectory(vp2ShaderFragments)
diff --git a/lib/mayaUsd/render/vp2ComputeShaders/CMakeLists.txt b/lib/mayaUsd/render/vp2ComputeShaders/CMakeLists.txt
new file mode 100644
index 0000000000..db6b641dcf
--- /dev/null
+++ b/lib/mayaUsd/render/vp2ComputeShaders/CMakeLists.txt
@@ -0,0 +1,13 @@
+# -----------------------------------------------------------------------------
+# install
+# -----------------------------------------------------------------------------
+list(APPEND MAYAUSD_COMPUTESHADERS
+    computeNormals.glsl
+    computeNormals.cl
+    plugInfo.json
+)
+
+install(FILES ${MAYAUSD_COMPUTESHADERS}
+    DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/usd/mayaUSD_ComputeShaders/resources
+)
+
diff --git a/lib/mayaUsd/render/vp2ComputeShaders/computeNormals.cl b/lib/mayaUsd/render/vp2ComputeShaders/computeNormals.cl
new file mode 100644
index 0000000000..11b9a2e050
--- /dev/null
+++ b/lib/mayaUsd/render/vp2ComputeShaders/computeNormals.cl
@@ -0,0 +1,34 @@
+__kernel void computeNormals(
+    __global const float* positions,
+    const unsigned int    vertexCount, // number of posisions and normals
+                                       /*  Adjacency buffer is two distinct parts.
+                                           First, two ints per vertex the offset and the valence. The valence is the number of adjacent
+                                          vertices. The offset is the offset into the adjacency buffer to find the vertex ids of the
+                                          adjacent vertices.                                    Next, a list of vertex ids of the
+                                          adjacent vertices for each vertex, found                                    using the information from the first part of the
+                                          buffer.
+                                       */
+    __global const int* adjacency,
+    __global float*     normals)
+{
+    unsigned int vertexId = get_global_id(0);
+    if (vertexId >= vertexCount)
+        return;
+
+    unsigned int  offsetIdx = vertexId * 2;
+    int           offset = adjacency[offsetIdx];
+    int           valence = adjacency[offsetIdx + 1];
+    __global int* currAdj = &adjacency[offset];
+
+    const float3 currVertex = vload3(vertexId, positions);
+    float3       accumulatedNormal = (float3)(0.0f, 0.0f, 0.0f);
+
+    for (int neighbour = 0; neighbour < valence; neighbour++) {
+        float3 prevVertex = vload3(*currAdj++, positions);
+        float3 nextVertex = vload3(*currAdj++, positions);
+        accumulatedNormal += cross(nextVertex - currVertex, prevVertex - currVertex);
+    }
+
+    accumulatedNormal = normalize(accumulatedNormal);
+    vstore3(accumulatedNormal, vertexId, normals);
+}
diff --git a/lib/mayaUsd/render/vp2ComputeShaders/computeNormals.glsl b/lib/mayaUsd/render/vp2ComputeShaders/computeNormals.glsl
new file mode 100644
index 0000000000..7778d04982
--- /dev/null
+++ b/lib/mayaUsd/render/vp2ComputeShaders/computeNormals.glsl
@@ -0,0 +1,69 @@
+#version 430
+
+layout( std140, binding=0 ) uniform Values
+{
+	uint VertexCount;
+};
+
+// This is a float3 but for buffer layout to be correct use float
+layout( std430, binding=1 ) buffer Pos
+{
+	float Positions[ ];
+};
+
+layout( std430, binding=2 ) buffer Adj
+{
+	int Adjacency[ ];
+};
+
+layout( std430, binding=3 ) buffer RtoS
+{
+	int RenderingToScene[ ];
+};
+
+layout( std430, binding=4 ) buffer StoR
+{
+	int SceneToRendering[ ];
+};
+
+// This is a float3 but for buffer layout to be correct use float
+layout( std430, binding=5 ) buffer Norm
+{
+	float Normals[ ];
+};
+
+layout( local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+	uint renderingVertexId = gl_GlobalInvocationID.x;
+	uint renderingVertexOffset = renderingVertexId *3;
+
+	if (renderingVertexId < VertexCount)
+	{
+		uint sceneVertexId = RenderingToScene[renderingVertexId];
+
+		uint adjOffsetIdx = sceneVertexId*2;
+		int offset = Adjacency[adjOffsetIdx];
+		int valence = Adjacency[adjOffsetIdx+1];
+
+		vec3 currVertex = vec3(Positions[renderingVertexOffset], Positions[renderingVertexOffset+1], Positions[renderingVertexOffset+2]);
+		vec3 accumulatedNormal = vec3(0.0, 0.0, 0.0);
+
+		for (int neighbour=0; neighbour<valence; neighbour++)
+		{
+			int prevVertexOffset = SceneToRendering[Adjacency[offset++]] * 3;
+			vec3 prevVertex = vec3(Positions[prevVertexOffset], Positions[prevVertexOffset+1], Positions[prevVertexOffset+2]);
+			int nextVertexOffset = SceneToRendering[Adjacency[offset++]] * 3;
+			vec3 nextVertex = vec3(Positions[nextVertexOffset], Positions[nextVertexOffset+1], Positions[nextVertexOffset+2]);
+			accumulatedNormal += cross(nextVertex - currVertex, prevVertex - currVertex);
+		}
+
+		accumulatedNormal = normalize(accumulatedNormal);
+
+		Normals[renderingVertexOffset] = accumulatedNormal.x;
+		Normals[renderingVertexOffset+1] = accumulatedNormal.y;
+		Normals[renderingVertexOffset+2] = accumulatedNormal.z;
+	}
+}
+
+
diff --git a/lib/mayaUsd/render/vp2ComputeShaders/plugInfo.json b/lib/mayaUsd/render/vp2ComputeShaders/plugInfo.json
new file mode 100644
index 0000000000..29709762aa
--- /dev/null
+++ b/lib/mayaUsd/render/vp2ComputeShaders/plugInfo.json
@@ -0,0 +1,11 @@
+{
+    "Plugins": [
+        {
+            "Name": "mayaUsd_ComputeShaders", 
+            "Info": {}, 
+            "ResourcePath": "../../usd/mayaUsd_ComputeShaders/resources", 
+            "Root": "..", 
+            "Type": "resource"
+        }
+    ]
+}
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/CMakeLists.txt b/lib/mayaUsd/render/vp2RenderDelegate/CMakeLists.txt
index 3578b4b2f6..b600c0858f 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/CMakeLists.txt
+++ b/lib/mayaUsd/render/vp2RenderDelegate/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(${PROJECT_NAME}
         instancer.cpp
         material.cpp
         mesh.cpp
+        meshViewportCompute.cpp
         proxyRenderDelegate.cpp
         render_delegate.cpp
         render_param.cpp
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
index 5653593f8b..32a4db7cec 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
@@ -327,6 +327,21 @@ void setWantConsolidation(MHWRender::MRenderItem& renderItem, bool state)
 }
 } // namespace
 
+void HdVP2Mesh::_InitGPUCompute()
+{
+    // check that the viewport is using OpenGL, we need it for the OpenGL normals computation
+    MRenderer* renderer = MRenderer::theRenderer();
+    // would also be nice to check the openGL version but renderer->drawAPIVersion() returns 4.
+    // Compute was added in 4.3 so I don't have enough information to make the check
+    if (renderer && renderer->drawAPIIsOpenGL()
+        && (TfGetenvInt("HDVP2_USE_GPU_NORMAL_COMPUTATION", 0) > 0))
+        _gpuNormalsComputeThreshold
+            = TfGetenvInt("HDVP2_GPU_NORMAL_COMPUTATION_MINIMUM_THRESHOLD", 8000);
+    else
+        _gpuNormalsComputeThreshold = INT_MAX;
+}
+
+int HdVP2Mesh::_gpuNormalsComputeThreshold = -1;
 //! \brief  Constructor
 #if defined(HD_API_VERSION) && HD_API_VERSION >= 36
 HdVP2Mesh::HdVP2Mesh(HdVP2RenderDelegate* delegate, const SdfPath& id)
@@ -338,9 +353,16 @@ HdVP2Mesh::HdVP2Mesh(HdVP2RenderDelegate* delegate, const SdfPath& id, const Sdf
     , _delegate(delegate)
     , _rprimId(id.GetText())
 {
+    _meshSharedData = std::make_shared<HdVP2MeshSharedData>();
     const MHWRender::MVertexBufferDescriptor vbDesc(
         "", MHWRender::MGeometry::kPosition, MHWRender::MGeometry::kFloat, 3);
-    _meshSharedData._positionsBuffer.reset(new MHWRender::MVertexBuffer(vbDesc));
+    _meshSharedData->_positionsBuffer.reset(new MHWRender::MVertexBuffer(vbDesc));
+    // HdChangeTracker::IsVarying() can check dirty bits to tell us if an object is animated or not.
+    // Not sure if it is correct on file load
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+    static std::once_flag initGPUComputeOnce;
+    std::call_once(initGPUComputeOnce, _InitGPUCompute);
+#endif
 }
 
 //! \brief  Synchronize VP2 state with scene delegate state based on dirty bits and representation
@@ -398,18 +420,28 @@ void HdVP2Mesh::Sync(
     }
 
     if (HdChangeTracker::IsTopologyDirty(*dirtyBits, id)) {
-        _meshSharedData._topology = GetMeshTopology(delegate);
+        _meshSharedData->_topology = GetMeshTopology(delegate);
 
-        const HdMeshTopology& topology = _meshSharedData._topology;
+        const HdMeshTopology& topology = _meshSharedData->_topology;
         const VtIntArray&     faceVertexIndices = topology.GetFaceVertexIndices();
         const size_t          numFaceVertexIndices = faceVertexIndices.size();
 
         VtIntArray newFaceVertexIndices;
         newFaceVertexIndices.resize(numFaceVertexIndices);
 
-        if (_IsUnsharedVertexLayoutRequired(_meshSharedData._primvarSourceMap)) {
-            _meshSharedData._numVertices = numFaceVertexIndices;
-            _meshSharedData._renderingToSceneFaceVtxIds = faceVertexIndices;
+        if (_IsUnsharedVertexLayoutRequired(_meshSharedData->_primvarSourceMap)) {
+            _meshSharedData->_numVertices = numFaceVertexIndices;
+            _meshSharedData->_renderingToSceneFaceVtxIds = faceVertexIndices;
+            _meshSharedData->_sceneToRenderingFaceVtxIds.clear();
+            _meshSharedData->_sceneToRenderingFaceVtxIds.resize(topology.GetNumPoints(), -1);
+
+            for (size_t i = 0; i < numFaceVertexIndices; i++) {
+                const int sceneFaceVtxId = faceVertexIndices[i];
+                _meshSharedData->_sceneToRenderingFaceVtxIds[sceneFaceVtxId]
+                    = i; // could check if the existing value is -1, but it doesn't matter. we just
+                         // need to map to a vertex in the position buffer that has the correct
+                         // value.
+            }
 
             // Fill with sequentially increasing values, starting from 0. The new
             // face vertex indices will be used to populate index data for unshared
@@ -419,31 +451,37 @@ void HdVP2Mesh::Sync(
             // should update _FillPrimvarData() code to remap indices correctly.
             std::iota(newFaceVertexIndices.begin(), newFaceVertexIndices.end(), 0);
         } else {
-            _meshSharedData._numVertices = topology.GetNumPoints();
-            _meshSharedData._renderingToSceneFaceVtxIds.clear();
+            _meshSharedData->_numVertices = topology.GetNumPoints();
+            _meshSharedData->_renderingToSceneFaceVtxIds.clear();
 
             // Allocate large enough memory with initial value of -1 to indicate
             // the rendering face vertex index is not determined yet.
-            std::vector<int> authorToRenderFaceVtxIds(numFaceVertexIndices, -1);
+            _meshSharedData->_sceneToRenderingFaceVtxIds.clear();
+            _meshSharedData->_sceneToRenderingFaceVtxIds.resize(numFaceVertexIndices, -1);
+            unsigned int sceneToRenderingFaceVtxIdsCount = 0;
 
             // Sort vertices to avoid drastically jumping indices. Cache efficiency
             // is important to fast rendering performance for dense mesh.
             for (size_t i = 0; i < numFaceVertexIndices; i++) {
-                const int authorFaceVtxId = faceVertexIndices[i];
+                const int sceneFaceVtxId = faceVertexIndices[i];
 
-                int renderFaceVtxId = authorToRenderFaceVtxIds[authorFaceVtxId];
+                int renderFaceVtxId = _meshSharedData->_sceneToRenderingFaceVtxIds[sceneFaceVtxId];
                 if (renderFaceVtxId < 0) {
-                    renderFaceVtxId = _meshSharedData._renderingToSceneFaceVtxIds.size();
-                    _meshSharedData._renderingToSceneFaceVtxIds.push_back(authorFaceVtxId);
+                    renderFaceVtxId = _meshSharedData->_renderingToSceneFaceVtxIds.size();
+                    _meshSharedData->_renderingToSceneFaceVtxIds.push_back(sceneFaceVtxId);
 
-                    authorToRenderFaceVtxIds[authorFaceVtxId] = renderFaceVtxId;
+                    _meshSharedData->_sceneToRenderingFaceVtxIds[sceneFaceVtxId] = renderFaceVtxId;
+                    sceneToRenderingFaceVtxIdsCount++;
                 }
 
                 newFaceVertexIndices[i] = renderFaceVtxId;
             }
+
+            _meshSharedData->_sceneToRenderingFaceVtxIds.resize(
+                sceneToRenderingFaceVtxIdsCount); // drop any extra -1 values.
         }
 
-        _meshSharedData._renderingTopology = HdMeshTopology(
+        _meshSharedData->_renderingTopology = HdMeshTopology(
             topology.GetScheme(),
             topology.GetOrientation(),
             topology.GetFaceVertexCounts(),
@@ -456,39 +494,46 @@ void HdVP2Mesh::Sync(
     // be updated only once when it gets dirty.
     if (HdChangeTracker::IsPrimvarDirty(*dirtyBits, id, HdTokens->points)) {
         const VtValue value = delegate->Get(id, HdTokens->points);
-        _meshSharedData._points = value.Get<VtVec3fArray>();
+        _meshSharedData->_points = value.Get<VtVec3fArray>();
 
-        const HdMeshTopology& topology = _meshSharedData._topology;
-        const size_t          numVertices = _meshSharedData._numVertices;
-
-        void* bufferData = _meshSharedData._positionsBuffer->acquire(numVertices, true);
-        if (bufferData) {
-            _FillPrimvarData(
-                static_cast<GfVec3f*>(bufferData),
-                numVertices,
-                0,
-                _meshSharedData._renderingToSceneFaceVtxIds,
-                _rprimId,
-                topology,
-                HdTokens->points,
-                _meshSharedData._points,
-                HdInterpolationVertex);
-
-            // Capture class member for lambda
-            MHWRender::MVertexBuffer* const positionsBuffer
-                = _meshSharedData._positionsBuffer.get();
-            const MString& rprimId = _rprimId;
+        const HdMeshTopology& topology = _meshSharedData->_topology;
+        const size_t          numVertices = _meshSharedData->_numVertices;
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+        _gpuNormalsEnabled = _gpuNormalsEnabled && numVertices >= _gpuNormalsComputeThreshold;
+#else
+        _gpuNormalsEnabled = false;
+#endif
 
-            _delegate->GetVP2ResourceRegistry().EnqueueCommit(
-                [positionsBuffer, bufferData, rprimId]() {
-                    MProfilingScope profilingScope(
-                        HdVP2RenderDelegate::sProfilerCategory,
-                        MProfiler::kColorC_L2,
-                        rprimId.asChar(),
-                        "CommitPositions");
+        if (numVertices > 0) {
+            void* bufferData = _meshSharedData->_positionsBuffer->acquire(numVertices, true);
+            if (bufferData) {
+                _FillPrimvarData(
+                    static_cast<GfVec3f*>(bufferData),
+                    numVertices,
+                    0,
+                    _meshSharedData->_renderingToSceneFaceVtxIds,
+                    _rprimId,
+                    topology,
+                    HdTokens->points,
+                    _meshSharedData->_points,
+                    HdInterpolationVertex);
+
+                // Capture class member for lambda
+                MHWRender::MVertexBuffer* const positionsBuffer
+                    = _meshSharedData->_positionsBuffer.get();
+                const MString& rprimId = _rprimId;
+
+                _delegate->GetVP2ResourceRegistry().EnqueueCommit(
+                    [positionsBuffer, bufferData, rprimId]() {
+                        MProfilingScope profilingScope(
+                            HdVP2RenderDelegate::sProfilerCategory,
+                            MProfiler::kColorC_L2,
+                            rprimId.asChar(),
+                            "CommitPositions");
 
-                    positionsBuffer->commit(bufferData);
-                });
+                        positionsBuffer->commit(bufferData);
+                    });
+            }
         }
     }
 
@@ -510,7 +555,7 @@ void HdVP2Mesh::Sync(
            | HdChangeTracker::DirtyVisibility
 #endif
            )) {
-        _meshSharedData._renderTag = delegate->GetRenderTag(id);
+        _meshSharedData->_renderTag = delegate->GetRenderTag(id);
     }
 
     *dirtyBits = HdChangeTracker::Clean;
@@ -848,20 +893,27 @@ void HdVP2Mesh::_UpdateDrawItem(
 
     const HdRenderIndex& renderIndex = sceneDelegate->GetRenderIndex();
 
-    const HdMeshTopology& topology = _meshSharedData._topology;
-    const auto&           primvarSourceMap = _meshSharedData._primvarSourceMap;
-    const size_t          numVertices = _meshSharedData._numVertices;
+    const HdMeshTopology& topology = _meshSharedData->_topology;
+    const auto&           primvarSourceMap = _meshSharedData->_primvarSourceMap;
+    const size_t          numVertices = _meshSharedData->_numVertices;
 
     // The bounding box item uses a globally-shared geometry data therefore it
     // doesn't need to extract index data from topology. Points use non-indexed
     // draw.
     const bool isBBoxItem = (renderItem->drawMode() == MHWRender::MGeometry::kBoundingBox);
     const bool isPointSnappingItem = (renderItem->primitive() == MHWRender::MGeometry::kPoints);
+#ifdef HDVP2_ENABLE_GPU_OSD
+    const bool isLineItem = (renderItem->primitive() == MHWRender::MGeometry::kLines);
+    // when we do OSD we don't bother creating indexing until after we have a smooth mesh
+    const bool requiresIndexUpdate
+        = !isBBoxItem && !isPointSnappingItem && (!_gpuOSDEnabled || isLineItem);
+#else
     const bool requiresIndexUpdate = !isBBoxItem && !isPointSnappingItem;
+#endif
 
     // Prepare index buffer.
     if (requiresIndexUpdate && (itemDirtyBits & HdChangeTracker::DirtyTopology)) {
-        const HdMeshTopology& topologyToUse = _meshSharedData._renderingTopology;
+        const HdMeshTopology& topologyToUse = _meshSharedData->_renderingTopology;
 
         if (desc.geomStyle == HdMeshGeomStyleHull) {
             HdMeshUtil   meshUtil(&topologyToUse, id);
@@ -871,19 +923,31 @@ void HdVP2Mesh::_UpdateDrawItem(
 
             const int numIndex = trianglesFaceVertexIndices.size() * 3;
 
-            stateToCommit._indexBufferData
-                = static_cast<int*>(drawItemData._indexBuffer->acquire(numIndex, true));
-
-            memcpy(
-                stateToCommit._indexBufferData,
-                trianglesFaceVertexIndices.data(),
-                numIndex * sizeof(int));
+            stateToCommit._indexBufferData = numIndex > 0
+                ? static_cast<int*>(drawItemData._indexBuffer->acquire(numIndex, true))
+                : nullptr;
+            if (stateToCommit._indexBufferData)
+                memcpy(
+                    stateToCommit._indexBufferData,
+                    trianglesFaceVertexIndices.data(),
+                    numIndex * sizeof(int));
+
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+            if (requireSmoothNormals && (_gpuNormalsEnabled || _gpuOSDEnabled)) {
+                // these function only do something if HDVP2_ENABLE_GPU_COMPUTE or
+                // HDVP2_ENABLE_GPU_OSD is defined
+                _CreateViewportCompute(*drawItem);
+#ifdef HDVP2_ENABLE_GPU_OSD
+                _CreateOSDTables();
+#endif
+            }
+#endif
         } else if (desc.geomStyle == HdMeshGeomStyleHullEdgeOnly) {
             unsigned int numIndex = _GetNumOfEdgeIndices(topologyToUse);
 
-            stateToCommit._indexBufferData
-                = static_cast<int*>(drawItemData._indexBuffer->acquire(numIndex, true));
-
+            stateToCommit._indexBufferData = numIndex
+                ? static_cast<int*>(drawItemData._indexBuffer->acquire(numIndex, true))
+                : nullptr;
             _FillEdgeIndices(stateToCommit._indexBufferData, topologyToUse);
         }
     }
@@ -910,23 +974,36 @@ void HdVP2Mesh::_UpdateDrawItem(
         if (!normals.empty()) {
             prepareNormals = ((itemDirtyBits & HdChangeTracker::DirtyNormals) != 0);
         } else if (requireSmoothNormals && (itemDirtyBits & DirtySmoothNormals)) {
-            // note: normals gets dirty when points are marked as dirty,
-            // at change tracker.
-            // HdC_TODO: move the normals computation to GPU to save expensive
-            // computation and buffer transfer.
-            Hd_VertexAdjacencySharedPtr adjacency(new Hd_VertexAdjacency());
-            HdBufferSourceSharedPtr     adjacencyComputation
-                = adjacency->GetSharedAdjacencyBuilderComputation(&topology);
-            adjacencyComputation->Resolve(); // IS the adjacency updated now?
-
-            // Only the points referenced by the topology are used to compute
-            // smooth normals.
-            normals = Hd_SmoothNormals::ComputeSmoothNormals(
-                adjacency.get(), _meshSharedData._points.size(), _meshSharedData._points.cdata());
-
-            interp = HdInterpolationVertex;
-
-            prepareNormals = !normals.empty();
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+            if (_gpuNormalsEnabled || _gpuOSDEnabled) {
+                if (!_meshSharedData->_viewportCompute) {
+                    _CreateViewportCompute(*drawItem);
+#ifdef HDVP2_ENABLE_GPU_OSD
+                    _CreateOSDTables();
+#endif
+                }
+                _meshSharedData->_viewportCompute->setNormalVertexBufferGPUDirty();
+                prepareNormals = false;
+            } else
+#endif
+            {
+                // note: normals gets dirty when points are marked as dirty,
+                // at change tracker.
+                Hd_VertexAdjacencySharedPtr adjacency(new Hd_VertexAdjacency());
+                HdBufferSourceSharedPtr     adjacencyComputation
+                    = adjacency->GetSharedAdjacencyBuilderComputation(&topology);
+                adjacencyComputation->Resolve(); // IS the adjacency updated now?
+
+                // Only the points referenced by the topology are used to compute
+                // smooth normals.
+                normals = Hd_SmoothNormals::ComputeSmoothNormals(
+                    adjacency.get(),
+                    _meshSharedData->_points.size(),
+                    _meshSharedData->_points.cdata());
+                interp = HdInterpolationVertex;
+
+                prepareNormals = !normals.empty();
+            }
         }
 
         if (prepareNormals) {
@@ -937,13 +1014,15 @@ void HdVP2Mesh::_UpdateDrawItem(
                 drawItemData._normalsBuffer.reset(new MHWRender::MVertexBuffer(vbDesc));
             }
 
-            void* bufferData = drawItemData._normalsBuffer->acquire(numVertices, true);
+            void* bufferData = numVertices > 0
+                ? drawItemData._normalsBuffer->acquire(numVertices, true)
+                : nullptr;
             if (bufferData) {
                 _FillPrimvarData(
                     static_cast<GfVec3f*>(bufferData),
                     numVertices,
                     0,
-                    _meshSharedData._renderingToSceneFaceVtxIds,
+                    _meshSharedData->_renderingToSceneFaceVtxIds,
                     _rprimId,
                     topology,
                     HdTokens->normals,
@@ -1040,7 +1119,9 @@ void HdVP2Mesh::_UpdateDrawItem(
                     drawItemData._colorBuffer.reset(new MHWRender::MVertexBuffer(vbDesc));
                 }
 
-                void* bufferData = drawItemData._colorBuffer->acquire(numVertices, true);
+                void* bufferData = numVertices > 0
+                    ? drawItemData._colorBuffer->acquire(numVertices, true)
+                    : nullptr;
 
                 // Fill color and opacity into the float4 color stream.
                 if (bufferData) {
@@ -1048,7 +1129,7 @@ void HdVP2Mesh::_UpdateDrawItem(
                         static_cast<GfVec4f*>(bufferData),
                         numVertices,
                         0,
-                        _meshSharedData._renderingToSceneFaceVtxIds,
+                        _meshSharedData->_renderingToSceneFaceVtxIds,
                         _rprimId,
                         topology,
                         HdTokens->displayColor,
@@ -1059,7 +1140,7 @@ void HdVP2Mesh::_UpdateDrawItem(
                         static_cast<GfVec4f*>(bufferData),
                         numVertices,
                         3,
-                        _meshSharedData._renderingToSceneFaceVtxIds,
+                        _meshSharedData->_renderingToSceneFaceVtxIds,
                         _rprimId,
                         topology,
                         HdTokens->displayOpacity,
@@ -1118,13 +1199,13 @@ void HdVP2Mesh::_UpdateDrawItem(
                 }
 
                 if (buffer) {
-                    bufferData = buffer->acquire(numVertices, true);
+                    bufferData = numVertices > 0 ? buffer->acquire(numVertices, true) : nullptr;
                     if (bufferData) {
                         _FillPrimvarData(
                             static_cast<float*>(bufferData),
                             numVertices,
                             0,
-                            _meshSharedData._renderingToSceneFaceVtxIds,
+                            _meshSharedData->_renderingToSceneFaceVtxIds,
                             _rprimId,
                             topology,
                             token,
@@ -1142,13 +1223,13 @@ void HdVP2Mesh::_UpdateDrawItem(
                 }
 
                 if (buffer) {
-                    bufferData = buffer->acquire(numVertices, true);
+                    bufferData = numVertices > 0 ? buffer->acquire(numVertices, true) : nullptr;
                     if (bufferData) {
                         _FillPrimvarData(
                             static_cast<GfVec2f*>(bufferData),
                             numVertices,
                             0,
-                            _meshSharedData._renderingToSceneFaceVtxIds,
+                            _meshSharedData->_renderingToSceneFaceVtxIds,
                             _rprimId,
                             topology,
                             token,
@@ -1166,13 +1247,13 @@ void HdVP2Mesh::_UpdateDrawItem(
                 }
 
                 if (buffer) {
-                    bufferData = buffer->acquire(numVertices, true);
+                    bufferData = numVertices > 0 ? buffer->acquire(numVertices, true) : nullptr;
                     if (bufferData) {
                         _FillPrimvarData(
                             static_cast<GfVec3f*>(bufferData),
                             numVertices,
                             0,
-                            _meshSharedData._renderingToSceneFaceVtxIds,
+                            _meshSharedData->_renderingToSceneFaceVtxIds,
                             _rprimId,
                             topology,
                             token,
@@ -1190,13 +1271,13 @@ void HdVP2Mesh::_UpdateDrawItem(
                 }
 
                 if (buffer) {
-                    bufferData = buffer->acquire(numVertices, true);
+                    bufferData = numVertices > 0 ? buffer->acquire(numVertices, true) : nullptr;
                     if (bufferData) {
                         _FillPrimvarData(
                             static_cast<GfVec4f*>(bufferData),
                             numVertices,
                             0,
-                            _meshSharedData._renderingToSceneFaceVtxIds,
+                            _meshSharedData->_renderingToSceneFaceVtxIds,
                             _rprimId,
                             topology,
                             token,
@@ -1403,7 +1484,7 @@ void HdVP2Mesh::_UpdateDrawItem(
             & (HdChangeTracker::DirtyVisibility | HdChangeTracker::DirtyRenderTag
                | HdChangeTracker::DirtyPoints | HdChangeTracker::DirtyExtent
                | DirtySelectionHighlight))) {
-        bool enable = drawItem->GetVisible() && !_meshSharedData._points.empty()
+        bool enable = drawItem->GetVisible() && !_meshSharedData->_points.empty()
             && !instancerWithNoInstances;
 
         if (isDedicatedSelectionHighlightItem) {
@@ -1414,7 +1495,7 @@ void HdVP2Mesh::_UpdateDrawItem(
             enable = enable && !range.IsEmpty();
         }
 
-        enable = enable && drawScene.DrawRenderTag(_meshSharedData._renderTag);
+        enable = enable && drawScene.DrawRenderTag(_meshSharedData->_renderTag);
 
         if (drawItemData._enabled != enable) {
             drawItemData._enabled = enable;
@@ -1431,7 +1512,7 @@ void HdVP2Mesh::_UpdateDrawItem(
     drawItem->ResetDirtyBits();
 
     // Capture the valid position buffer and index buffer
-    MHWRender::MVertexBuffer* positionsBuffer = _meshSharedData._positionsBuffer.get();
+    MHWRender::MVertexBuffer* positionsBuffer = _meshSharedData->_positionsBuffer.get();
     MHWRender::MIndexBuffer*  indexBuffer = drawItemData._indexBuffer.get();
 
     if (isBBoxItem) {
@@ -1611,6 +1692,145 @@ void HdVP2Mesh::_HideAllDrawItems(const TfToken& reprToken)
     }
 }
 
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+/*! \brief  Save topology information for later GPGPU evaluation
+
+    This function pulls topology and UV data from the scene delegate and save that
+    information to be used as an input to the normal calculation later.
+*/
+void HdVP2Mesh::_CreateViewportCompute(const HdVP2DrawItem& drawItem)
+{
+    if (_meshSharedData->_viewportCompute) {
+        // I can't handle multiple draw items that require normals
+        TF_VERIFY(_meshSharedData->_viewportCompute->verifyDrawItem(drawItem));
+    } else {
+        _meshSharedData->_viewportCompute
+            = MSharedPtr<MeshViewportCompute>::make<>(_meshSharedData, &drawItem);
+        MHWRender::MRenderItem* renderItem = drawItem.GetRenderItem();
+        renderItem->addViewportComputeItem(_meshSharedData->_viewportCompute);
+    }
+}
+#endif
+
+#ifdef HDVP2_ENABLE_GPU_OSD
+void HdVP2Mesh::_CreateOSDTables()
+{
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+    if (!_gpuOSDEnabled)
+        return;
+
+    assert(_meshSharedData->_viewportCompute);
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory, MProfiler::kColorD_L2, "createOSDTables");
+
+    // create topology refiner
+    PxOsdTopologyRefinerSharedPtr refiner;
+
+    OpenSubdiv::Far::StencilTable const* vertexStencils = nullptr;
+    OpenSubdiv::Far::StencilTable const* varyingStencils = nullptr;
+    OpenSubdiv::Far::PatchTable const*   patchTable = nullptr;
+
+    HdMeshTopology* topology
+        = &_meshSharedData->_renderingTopology; // TODO: something with _topology?
+
+    // for empty topology, we don't need to refine anything.
+    // but still need to return the typed buffer for codegen
+    if (topology->GetFaceVertexCounts().size() == 0) {
+        // leave refiner empty
+    } else {
+        refiner = PxOsdRefinerFactory::Create(
+            topology->GetPxOsdMeshTopology(), TfToken(_meshSharedData->_renderTag.GetText()));
+    }
+
+    if (refiner) {
+        OpenSubdiv::Far::PatchTableFactory::Options patchOptions(
+            _meshSharedData->_viewportCompute->level);
+        if (_meshSharedData->_viewportCompute->adaptive) {
+            patchOptions.endCapType
+                = OpenSubdiv::Far::PatchTableFactory::Options::ENDCAP_BSPLINE_BASIS;
+#if OPENSUBDIV_VERSION_NUMBER >= 30400
+            // Improve fidelity when refining to limit surface patches
+            // These options supported since v3.1.0 and v3.2.0 respectively.
+            patchOptions.useInfSharpPatch = true;
+            patchOptions.generateLegacySharpCornerPatches = false;
+#endif
+        }
+
+        // split trace scopes.
+        {
+            MProfilingScope subProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory, MProfiler::kColorD_L2, "refine");
+            if (_meshSharedData->_viewportCompute->adaptive) {
+                OpenSubdiv::Far::TopologyRefiner::AdaptiveOptions adaptiveOptions(
+                    _meshSharedData->_viewportCompute->level);
+#if OPENSUBDIV_VERSION_NUMBER >= 30400
+                adaptiveOptions = patchOptions.GetRefineAdaptiveOptions();
+#endif
+                refiner->RefineAdaptive(adaptiveOptions);
+            } else {
+                refiner->RefineUniform(_meshSharedData->_viewportCompute->level);
+            }
+        }
+#define GENERATE_SOURCE_TABLES
+#ifdef GENERATE_SOURCE_TABLES
+        {
+            MProfilingScope subProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory, MProfiler::kColorD_L2, "stencilFactory");
+            OpenSubdiv::Far::StencilTableFactory::Options options;
+            options.generateOffsets = true;
+            options.generateIntermediateLevels = _meshSharedData->_viewportCompute->adaptive;
+            options.interpolationMode = OpenSubdiv::Far::StencilTableFactory::INTERPOLATE_VERTEX;
+            vertexStencils = OpenSubdiv::Far::StencilTableFactory::Create(*refiner, options);
+
+            options.interpolationMode = OpenSubdiv::Far::StencilTableFactory::INTERPOLATE_VARYING;
+            varyingStencils = OpenSubdiv::Far::StencilTableFactory::Create(*refiner, options);
+        }
+        {
+            MProfilingScope subProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory, MProfiler::kColorD_L2, "patchFactory");
+            patchTable = OpenSubdiv::Far::PatchTableFactory::Create(*refiner, patchOptions);
+        }
+#else
+        // grab the values we need from the refiner.
+        const OpenSubdiv::Far::TopologyLevel& refinedLevel
+            = refiner->GetLevel(refiner->GetMaxLevel());
+        size_t indexLength = refinedLevel.GetNumFaces()
+            * 4; // i know it is quads but not always? can we do this more safely?
+        size_t vertexLength = GetNumVerticesTotal();
+        // save these values and use them to create the updated geometry index mapping.
+#endif
+    }
+#ifdef GENERATE_SOURCE_TABLES
+    // merge endcap
+    if (patchTable && patchTable->GetLocalPointStencilTable()) {
+        // append stencils
+        if (OpenSubdiv::Far::StencilTable const* vertexStencilsWithLocalPoints
+            = OpenSubdiv::Far::StencilTableFactory::AppendLocalPointStencilTable(
+                *refiner, vertexStencils, patchTable->GetLocalPointStencilTable())) {
+            delete vertexStencils;
+            vertexStencils = vertexStencilsWithLocalPoints;
+        }
+        if (OpenSubdiv::Far::StencilTable const* varyingStencilsWithLocalPoints
+            = OpenSubdiv::Far::StencilTableFactory::AppendLocalPointStencilTable(
+                *refiner, varyingStencils, patchTable->GetLocalPointStencilTable())) {
+            delete varyingStencils;
+            varyingStencils = varyingStencilsWithLocalPoints;
+        }
+    }
+
+    // save values for the next loop
+    _meshSharedData->_viewportCompute->vertexStencils.reset(vertexStencils);
+    _meshSharedData->_viewportCompute->varyingStencils.reset(varyingStencils);
+    _meshSharedData->_viewportCompute->patchTable.reset(patchTable);
+#endif
+
+    // if there is a sourceMeshSharedData it should have entries for every vertex in that geometry
+    // source.
+
+#endif
+}
+#endif
+
 /*! \brief  Update _primvarSourceMap, our local cache of raw primvar data.
 
     This function pulls data from the scene delegate, but defers processing.
@@ -1637,10 +1857,10 @@ void HdVP2Mesh::_UpdatePrimvarSources(
             if (std::find(begin, end, pv.name) != end) {
                 if (HdChangeTracker::IsPrimvarDirty(dirtyBits, id, pv.name)) {
                     const VtValue value = GetPrimvar(sceneDelegate, pv.name);
-                    _meshSharedData._primvarSourceMap[pv.name] = { value, interp };
+                    _meshSharedData->_primvarSourceMap[pv.name] = { value, interp };
                 }
             } else {
-                _meshSharedData._primvarSourceMap.erase(pv.name);
+                _meshSharedData->_primvarSourceMap.erase(pv.name);
             }
         }
     }
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
index e76c633e32..9249a64156 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
@@ -16,7 +16,7 @@
 #ifndef HD_VP2_MESH
 #define HD_VP2_MESH
 
-#include <mayaUsd/render/vp2RenderDelegate/proxyRenderDelegate.h>
+#include "meshViewportCompute.h"
 
 #include <pxr/imaging/hd/mesh.h>
 #include <pxr/pxr.h>
@@ -64,6 +64,10 @@ struct HdVP2MeshSharedData
     //! The number of vertices in each vertex buffer.
     size_t _numVertices;
 
+    //! An array to store a rendering face vertex index for each original scene
+    //! face vertex index.
+    std::vector<int> _sceneToRenderingFaceVtxIds;
+
     //! A local cache of primvar scene data. "data" is a copy-on-write handle to
     //! the actual primvar buffer, and "interpolation" is the interpolation mode
     //! to be used.
@@ -78,6 +82,9 @@ struct HdVP2MeshSharedData
 
     //! Render tag of the Rprim.
     TfToken _renderTag;
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+    MSharedPtr<MeshViewportCompute> _viewportCompute;
+#endif
 };
 
 /*! \brief  VP2 representation of poly-mesh object.
@@ -117,6 +124,13 @@ class HdVP2Mesh final : public HdMesh
 
     void _UpdateRepr(HdSceneDelegate*, const TfToken&);
 
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+    void _CreateViewportCompute(const HdVP2DrawItem& drawItem);
+#endif
+#ifdef HDVP2_ENABLE_GPU_OSD
+    void _CreateOSDTables();
+#endif
+
     void _UpdateDrawItem(
         HdSceneDelegate*,
         HdVP2DrawItem*,
@@ -137,6 +151,8 @@ class HdVP2Mesh final : public HdMesh
     MHWRender::MRenderItem* _CreatePointsRenderItem(const MString& name) const;
     MHWRender::MRenderItem* _CreateBoundingBoxRenderItem(const MString& name) const;
 
+    static void _InitGPUCompute();
+
     //! Custom dirty bits used by this mesh
     enum DirtyBits : HdDirtyBits
     {
@@ -156,10 +172,25 @@ class HdVP2Mesh final : public HdMesh
         0
     };                      //!< Storage for custom dirty bits. See _PropagateDirtyBits for details.
     const MString _rprimId; //!< Rprim id cached as a maya string for easier debugging and profiling
-    HdVP2MeshSharedData _meshSharedData; //!< Shared data for all draw items of the Rprim
+    std::shared_ptr<HdVP2MeshSharedData>
+        _meshSharedData; //!< Shared data for all draw items of the Rprim
 
     //! Selection status of the Rprim
     HdVP2SelectionStatus _selectionStatus { kUnselected };
+
+    //! Control GPU compute behavior
+    //! Having these in place even without HDVP2_ENABLE_GPU_COMPUTE or HDVP2_ENABLE_GPU_OSD defined
+    //! makes the expressions using these variables much simpler
+    bool _gpuNormalsEnabled { true }; //!< Use GPU Compute for normal calculation, only used when
+                                      //!< HDVP2_ENABLE_GPU_COMPUTE is defined
+    bool _gpuOSDEnabled {
+#ifdef HDVP2_ENABLE_GPU_OSD
+        true
+#else
+        false
+#endif
+    }; //!< Use GPU Compute for OSD, only used when HDVP2_ENABLE_GPU_OSD is defined
+    static int _gpuNormalsComputeThreshold;
 };
 
 PXR_NAMESPACE_CLOSE_SCOPE
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp
new file mode 100644
index 0000000000..5cf0b049c5
--- /dev/null
+++ b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp
@@ -0,0 +1,1335 @@
+//
+// Copyright 2020 Autodesk
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "meshViewportCompute.h"
+
+#include "mesh.h"
+#include "render_delegate.h"
+
+#include <mayaUsd/render/vp2RenderDelegate/proxyRenderDelegate.h>
+
+#include <pxr/imaging/hd/vertexAdjacency.h>
+
+#include <maya/MProfiler.h>
+
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+
+PXR_NAMESPACE_OPEN_SCOPE
+
+namespace {
+
+std::string _GetResourcePath(const std::string& resource)
+{
+    static PlugPluginPtr plugin
+        = PlugRegistry::GetInstance().GetPluginWithName("mayaUsd_ComputeShaders");
+    if (!TF_VERIFY(plugin, "Could not get plugin\n")) {
+        return std::string();
+    }
+
+    const std::string path = PlugFindPluginResource(plugin, resource);
+    TF_VERIFY(!path.empty(), "Cound not find resource: %s\n", resource.c_str());
+
+    return path;
+}
+
+template <typename F> class LambdaTask : public tbb::task
+{
+public:
+    LambdaTask(const F& func)
+        : _func(func)
+    {
+    }
+
+private:
+    tbb::task* execute()
+    {
+        _func();
+        return nullptr;
+    }
+
+    F _func;
+};
+
+template <typename F> void EnqueueLambdaTask(const F& f)
+{
+    tbb::task::enqueue(*new (tbb::task::allocate_root()) LambdaTask<F>(f));
+}
+} // namespace
+
+std::once_flag      MeshViewportCompute::_compileProgramOnce;
+PxrMayaGLSLProgram* MeshViewportCompute::_computeNormalsProgram;
+
+bool MeshViewportCompute::verifyDrawItem(const HdVP2DrawItem& drawItem) const
+{
+    return &drawItem == _drawItem;
+}
+
+void MeshViewportCompute::openGLErrorCheck()
+{
+//#define DO_OPENGL_ERROR_CHECK
+#ifdef DO_OPENGL_ERROR_CHECK
+    glFinish();
+    // Check for errors.
+    GLenum err = glGetError();
+    TF_VERIFY(GL_NO_ERROR == err);
+#endif
+}
+
+void MeshViewportCompute::setTopologyDirty()
+{
+    _topologyDirty = true;
+    _executed = false;
+    _vertexCount = 0;
+    if (_consolidatedCompute) {
+        _consolidatedCompute->_topologyDirty = true;
+        _consolidatedCompute->_vertexCount = 0;
+    }
+}
+
+void MeshViewportCompute::setAdjacencyBufferGPUDirty()
+{
+    _adjacencyBufferGPUDirty = true;
+    _executed = false;
+    if (_consolidatedCompute)
+        _consolidatedCompute->_adjacencyBufferGPUDirty = true;
+}
+
+void MeshViewportCompute::setNormalVertexBufferGPUDirty()
+{
+    _normalVertexBufferGPUDirty = true;
+    _executed = false;
+    if (_consolidatedCompute)
+        _consolidatedCompute->_normalVertexBufferGPUDirty = true;
+}
+
+void MeshViewportCompute::reset()
+{
+    /*  don't clear _meshSharedData, it's either an input from the external HdVP2Mesh
+        or it has been created explicitly for this consolidated viewport compute.
+    */
+    _drawItem = nullptr;
+    _executed = false;
+    _sourcesExecuted = false;
+
+    _consolidatedCompute.reset();
+    _geometryIndexMapping.reset();
+    _vertexCount = 0;
+
+    _adjacencyBufferSize = 0;
+    _adjacencyBufferCPU.reset();
+    _adjacencyBufferGPU.reset();
+    _renderingToSceneFaceVtxIdsGPU.reset();
+    _sceneToRenderingFaceVtxIdsGPU.reset();
+
+    fRenderGeom = nullptr;
+
+    _positionVertexBufferGPU = nullptr;
+    _normalVertexBufferGPU = nullptr;
+    _colorVertexBufferGPU = nullptr;
+
+    _topologyDirty = true;
+    _adjacencyBufferGPUDirty = true;
+    _normalVertexBufferGPUDirty = true;
+
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+    // OSD information
+    _vertexStencils.reset();
+    _varyingStencils.reset();
+    _patchTable.reset();
+#endif
+}
+
+bool MeshViewportCompute::hasExecuted() const { return _executed; }
+
+void MeshViewportCompute::findConsolidationMapping(MRenderItem& renderItem)
+{
+    // If the item is not consolidated clear any stale consolidatedCompute information
+    if (!renderItem.isConsolidated()) {
+        _consolidatedCompute.reset();
+        _sourcesExecuted = false;
+        return;
+    }
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:MGeometryIndexMapping");
+
+    if (_geometryIndexMapping) {
+        if (renderItem.isSourceIndexMappingValid(*_geometryIndexMapping.get()))
+            return;
+    }
+
+    reset();
+    _geometryIndexMapping.reset(new MHWRender::MGeometryIndexMapping());
+    renderItem.sourceIndexMapping(*_geometryIndexMapping.get());
+
+    for (int i = 0; i < _geometryIndexMapping->geometryCount(); i++) {
+        MRenderItem*                    sourceItem = _geometryIndexMapping->sourceRenderItem(i);
+        MSharedPtr<MeshViewportCompute> sourceViewportComputeItem
+            = MSharedPtr<MeshViewportCompute>::dynamic_pointer_cast<>(
+                sourceItem->viewportComputeItem());
+        sourceViewportComputeItem->_consolidatedCompute
+            = MSharedPtr<MeshViewportCompute>::dynamic_pointer_cast<>(
+                renderItem.viewportComputeItem());
+        TF_VERIFY(i == 0 || _sourcesExecuted == sourceViewportComputeItem->hasExecuted());
+        _sourcesExecuted = sourceViewportComputeItem->hasExecuted();
+    }
+}
+
+HdMeshTopology& getSceneTopology(HdVP2MeshSharedData* meshSharedData)
+{
+    return meshSharedData->_topology;
+}
+
+HdMeshTopology& getRenderingTopology(HdVP2MeshSharedData* meshSharedData)
+{
+    return meshSharedData->_renderingTopology;
+}
+
+template <typename TopologyAccessor>
+void MeshViewportCompute::createConsolidatedTopology(TopologyAccessor getTopology)
+{
+    if (!_topologyDirty && getTopology(_meshSharedData.get()).GetNumPoints() > 0)
+        return;
+    _topologyDirty = false;
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:createConsolidatedTopology");
+
+    size_t faceVertexCountsSize = 0;
+    size_t faceVertexIndicesSize = 0;
+    size_t holeIndicesSize = 0;
+    size_t sceneToRenderingFaceVtxIdsCount = 0;
+    size_t vertexCount = 0;
+
+    // figure out the size of the consolidated mesh topology
+    if (_geometryIndexMapping.get()) {
+        for (int i = 0; i < _geometryIndexMapping->geometryCount(); i++) {
+            MRenderItem* sourceItem = _geometryIndexMapping->sourceRenderItem(i);
+
+            MSharedPtr<MeshViewportCompute> sourceViewportComputeItem
+                = MSharedPtr<MeshViewportCompute>::dynamic_pointer_cast<>(
+                    sourceItem->viewportComputeItem());
+            HdVP2MeshSharedData* sourceMeshSharedData
+                = sourceViewportComputeItem->_meshSharedData.get();
+            TF_VERIFY(sourceMeshSharedData);
+            HdMeshTopology& sourceTopology = getTopology(sourceMeshSharedData);
+
+            faceVertexCountsSize += sourceTopology.GetNumFaces();
+            faceVertexIndicesSize += sourceTopology.GetFaceVertexIndices().size();
+            holeIndicesSize += sourceTopology.GetHoleIndices().size();
+            // if there is a sourceMeshSharedData it should have entries for every vertex in that
+            // geometry source.
+            vertexCount += sourceTopology.GetNumPoints();
+            sceneToRenderingFaceVtxIdsCount += sourceTopology.GetNumPoints();
+        }
+    }
+
+    TF_VERIFY(faceVertexCountsSize > 0 || !_geometryIndexMapping.get());
+
+    // check to see if it is actually a consolidated geometry that needs a consolidated adjacency
+    // buffer
+    if (faceVertexCountsSize > 0) {
+        TfToken    scheme = PxOsdOpenSubdivTokens->catmullClark;
+        TfToken    orientation = PxOsdOpenSubdivTokens->rightHanded;
+        VtIntArray faceVertexCounts;
+        VtIntArray faceVertexIndices;
+        VtIntArray holeIndices;
+        int        refineLevel = 0;
+
+        faceVertexCounts.reserve(faceVertexCountsSize);
+        faceVertexIndices.reserve(faceVertexIndicesSize);
+        holeIndices.reserve(holeIndicesSize);
+        _meshSharedData->_renderingToSceneFaceVtxIds.clear();
+        _meshSharedData->_renderingToSceneFaceVtxIds.reserve(vertexCount);
+        _meshSharedData->_sceneToRenderingFaceVtxIds.clear();
+        _meshSharedData->_sceneToRenderingFaceVtxIds.reserve(sceneToRenderingFaceVtxIdsCount);
+
+        for (int sourceIndex = 0; sourceIndex < _geometryIndexMapping->geometryCount();
+             sourceIndex++) {
+            MRenderItem* sourceItem = _geometryIndexMapping->sourceRenderItem(sourceIndex);
+            int consolidatedBufferVertexOffset = _geometryIndexMapping->vertexStart(sourceIndex);
+            MSharedPtr<MeshViewportCompute> sourceViewportComputeItem
+                = MSharedPtr<MeshViewportCompute>::dynamic_pointer_cast<>(
+                    sourceItem->viewportComputeItem());
+            HdVP2MeshSharedData* sourceMeshSharedData
+                = sourceViewportComputeItem->_meshSharedData.get();
+            TF_VERIFY(sourceMeshSharedData);
+
+            HdMeshTopology& sourceTopology = getTopology(sourceMeshSharedData);
+
+            TF_VERIFY(sourceTopology.GetScheme() == scheme);
+            TF_VERIFY(sourceTopology.GetOrientation() == orientation);
+            TF_VERIFY(sourceTopology.GetRefineLevel() == refineLevel);
+
+            VtIntArray const& sourceFaceVertexCounts = sourceTopology.GetFaceVertexCounts();
+            for (int faceId = 0; faceId < sourceFaceVertexCounts.size(); faceId++) {
+                faceVertexCounts.push_back(sourceFaceVertexCounts[faceId]);
+            }
+
+            VtIntArray const& sourceFaceVertexIndices = sourceTopology.GetFaceVertexIndices();
+            for (int faceVertexId = 0; faceVertexId < sourceFaceVertexIndices.size();
+                 faceVertexId++) {
+                faceVertexIndices.push_back(
+                    sourceFaceVertexIndices[faceVertexId] + consolidatedBufferVertexOffset);
+            }
+
+            VtIntArray const& sourceHoleIndices = sourceTopology.GetHoleIndices();
+            for (int faceId = 0; faceId < sourceHoleIndices.size(); faceId++) {
+                holeIndices.push_back(
+                    sourceHoleIndices[faceId] + consolidatedBufferVertexOffset); // untested?
+            }
+
+            for (int idx = 0; idx < sourceMeshSharedData->_renderingToSceneFaceVtxIds.size();
+                 idx++) {
+                _meshSharedData->_renderingToSceneFaceVtxIds.push_back(
+                    sourceMeshSharedData->_renderingToSceneFaceVtxIds[idx]
+                    + consolidatedBufferVertexOffset);
+            }
+
+            // add padding to _sceneToRenderingFaceVtxIds because the scene IDs start at
+            // consolidatedBufferVertexOffset
+            while (consolidatedBufferVertexOffset
+                   > _meshSharedData->_sceneToRenderingFaceVtxIds.size()) {
+                _meshSharedData->_sceneToRenderingFaceVtxIds.push_back(-1);
+            }
+
+            for (int idx = 0; idx < sourceMeshSharedData->_sceneToRenderingFaceVtxIds.size();
+                 idx++) {
+                _meshSharedData->_sceneToRenderingFaceVtxIds.push_back(
+                    sourceMeshSharedData->_sceneToRenderingFaceVtxIds[idx]
+                    + consolidatedBufferVertexOffset);
+            }
+        }
+
+        HdMeshTopology consolidatedTopology(
+            scheme, orientation, faceVertexCounts, faceVertexIndices, holeIndices, refineLevel);
+        getTopology(_meshSharedData.get()) = consolidatedTopology;
+    } else {
+        // It is not a consolidated draw item. There is only a single topology so we can use that
+        // directly In fact it is already there, nothing to do!
+        vertexCount = _meshSharedData->_renderingTopology.GetNumPoints();
+
+        // the non-consolidated topology doesn't get updated with _sceneToRenderingFaceVtxIds, I
+        // guess we have to do it in the kernel? Or have different storage for a topology we modify.
+
+        // Can't modify _meshSharedData if we are not consolidated!
+    }
+
+    TF_VERIFY(vertexCount == 0 || vertexCount == _vertexCount);
+    _vertexCount = vertexCount;
+}
+
+void MeshViewportCompute::createConsolidatedAdjacency()
+{
+    if (_adjacencyBufferSize > 0)
+        return;
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:createConsolidatedAdjacency");
+
+    bool isConsolidated = nullptr != _geometryIndexMapping.get();
+
+    Hd_VertexAdjacencySharedPtr adjacency(new Hd_VertexAdjacency());
+    HdBufferSourceSharedPtr     adjacencyComputation
+        = adjacency->GetSharedAdjacencyBuilderComputation(&_meshSharedData->_topology);
+    adjacencyComputation->Resolve();
+
+    const VtIntArray& adjacencyTable = adjacency->GetAdjacencyTable();
+    size_t            adjacencyBufferSize = adjacencyTable.size();
+    int*              adjCopy = new int[adjacencyBufferSize];
+    memcpy(adjCopy, adjacencyTable.data(), adjacencyBufferSize * sizeof(int));
+    _adjacencyBufferCPU.reset(
+        adjCopy); // make sure this is really using the const version and doesn't copy the data.
+    _adjacencyBufferSize = adjacencyTable.size();
+}
+
+void MeshViewportCompute::findRenderGeometry(MRenderItem& renderItem)
+{
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:findRenderGeometry");
+
+    MGeometry* renderGeometry = renderItem.geometry();
+    if (fRenderGeom && (fRenderGeom != renderGeometry)) {
+        _positionVertexBufferGPU = nullptr;
+        _normalVertexBufferGPU = nullptr;
+        _colorVertexBufferGPU = nullptr;
+        fRenderGeom = nullptr;
+    }
+    fRenderGeom = renderGeometry;
+}
+
+void MeshViewportCompute::createConsolidatedOSDTables(MRenderItem& renderItem)
+{
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:consolidatedOSDTables");
+
+    // refine
+    //  and
+    // create stencil/patch table
+    {
+        OpenSubdiv::Far::StencilTable const* consolidatedVertexStencils = nullptr;
+        OpenSubdiv::Far::StencilTable const* consolidatedVaryingStencils = nullptr;
+        OpenSubdiv::Far::PatchTable const*   consolidatedPatchTable = nullptr;
+
+        // if this is a consolidated item then we won't have any stencils or tables.
+        // If this is an unconsolidated item then we'll already have the tables we need.
+        if (!_vertexStencils || !_varyingStencils || !_patchTable) {
+            MProfilingScope subsubProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory,
+                MProfiler::kColorD_L2,
+                "MeshViewportCompute:createConsolidatedMeshTables");
+
+            // create topology refiner
+            PxOsdTopologyRefinerSharedPtr refiner;
+            // for empty topology, we don't need to refine anything.
+            // but still need to return the typed buffer for codegen
+            if (_meshSharedData->_renderingTopology.GetFaceVertexCounts().size() == 0) {
+                // leave refiner empty
+            } else {
+                refiner = PxOsdRefinerFactory::Create(
+                    _meshSharedData->_renderingTopology.GetPxOsdMeshTopology(),
+                    TfToken(_meshSharedData->_renderTag.GetText()));
+            }
+
+            if (refiner) {
+                OpenSubdiv::Far::PatchTableFactory::Options patchOptions(_level);
+                if (_adaptive) {
+                    patchOptions.endCapType
+                        = OpenSubdiv::Far::PatchTableFactory::Options::ENDCAP_BSPLINE_BASIS;
+#if OPENSUBDIV_VERSION_NUMBER >= 30400
+                    // Improve fidelity when refining to limit surface patches
+                    // These options supported since v3.1.0 and v3.2.0 respectively.
+                    patchOptions.useInfSharpPatch = true;
+                    patchOptions.generateLegacySharpCornerPatches = false;
+#endif
+                }
+
+                // split trace scopes.
+                {
+                    MProfilingScope subsubsubProfilingScope(
+                        HdVP2RenderDelegate::sProfilerCategory,
+                        MProfiler::kColorD_L2,
+                        "MeshViewportCompute:refine");
+                    if (_adaptive) {
+                        OpenSubdiv::Far::TopologyRefiner::AdaptiveOptions adaptiveOptions(_level);
+#if OPENSUBDIV_VERSION_NUMBER >= 30400
+                        adaptiveOptions = patchOptions.GetRefineAdaptiveOptions();
+#endif
+                        refiner->RefineAdaptive(adaptiveOptions);
+                    } else {
+                        refiner->RefineUniform(_level);
+                    }
+                }
+                {
+                    MProfilingScope subsubsubProfilingScope(
+                        HdVP2RenderDelegate::sProfilerCategory,
+                        MProfiler::kColorD_L2,
+                        "MeshViewportCompute:stencilFactory");
+                    OpenSubdiv::Far::StencilTableFactory::Options options;
+                    options.generateOffsets = true;
+                    options.generateIntermediateLevels = _adaptive;
+                    options.interpolationMode
+                        = OpenSubdiv::Far::StencilTableFactory::INTERPOLATE_VERTEX;
+                    consolidatedVertexStencils
+                        = OpenSubdiv::Far::StencilTableFactory::Create(*refiner, options);
+
+                    options.interpolationMode
+                        = OpenSubdiv::Far::StencilTableFactory::INTERPOLATE_VARYING;
+                    consolidatedVaryingStencils
+                        = OpenSubdiv::Far::StencilTableFactory::Create(*refiner, options);
+                }
+                {
+                    MProfilingScope subsubsubProfilingScope(
+                        HdVP2RenderDelegate::sProfilerCategory,
+                        MProfiler::kColorD_L2,
+                        "MeshViewportCompute:patchFactory");
+                    consolidatedPatchTable
+                        = OpenSubdiv::Far::PatchTableFactory::Create(*refiner, patchOptions);
+                }
+            }
+
+            // merge endcap
+            if (consolidatedPatchTable && consolidatedPatchTable->GetLocalPointStencilTable()) {
+                // append stencils
+                if (OpenSubdiv::Far::StencilTable const* vertexStencilsWithLocalPoints
+                    = OpenSubdiv::Far::StencilTableFactory::AppendLocalPointStencilTable(
+                        *refiner,
+                        consolidatedVertexStencils,
+                        consolidatedPatchTable->GetLocalPointStencilTable())) {
+                    delete consolidatedVertexStencils;
+                    consolidatedVertexStencils = vertexStencilsWithLocalPoints;
+                }
+                if (OpenSubdiv::Far::StencilTable const* varyingStencilsWithLocalPoints
+                    = OpenSubdiv::Far::StencilTableFactory::AppendLocalPointStencilTable(
+                        *refiner,
+                        consolidatedVaryingStencils,
+                        consolidatedPatchTable->GetLocalPointStencilTable())) {
+                    delete consolidatedVaryingStencils;
+                    consolidatedVaryingStencils = varyingStencilsWithLocalPoints;
+                }
+            }
+
+            // save tables to topology
+            _vertexStencils.reset(consolidatedVertexStencils);
+            _varyingStencils.reset(consolidatedVaryingStencils);
+            _patchTable.reset(consolidatedPatchTable);
+        }
+    }
+
+    if (_geometryIndexMapping && _geometryIndexMapping->geometryCount() > 0) {
+        MProfilingScope subsubProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L2,
+            "MeshViewportCompute:updateIndexMapping");
+
+        // TODO: assumes quads
+        int indexStart = 0;
+        int vertexStart = 0;
+        for (int i = 0; i < _geometryIndexMapping->geometryCount(); i++) {
+            MRenderItem* sourceItem = _geometryIndexMapping->sourceRenderItem(i);
+
+            // we can have a nullptr sourceMeshSharedData, but if we do all the sourceItems should
+            // have a nullptr
+            MSharedPtr<MeshViewportCompute> sourceViewportComputeItem
+                = MSharedPtr<MeshViewportCompute>::dynamic_pointer_cast<>(
+                    sourceItem->viewportComputeItem());
+            size_t sourcePtableSize
+                = sourceViewportComputeItem->_patchTable->GetPatchControlVerticesTable().size();
+            size_t sourceBaseVertexCount
+                = sourceViewportComputeItem->_meshSharedData->_renderingTopology
+                      .GetNumPoints(); // _geometryIndexMapping->vertexLength(i);
+            size_t sourceSmoothVertexCount
+                = sourceViewportComputeItem->_vertexStencils->GetNumStencils();
+            _geometryIndexMapping->updateSource(
+                i,
+                indexStart,
+                sourcePtableSize * 1.5,
+                vertexStart,
+                sourceBaseVertexCount + sourceSmoothVertexCount);
+            indexStart += sourcePtableSize * 1.5;
+            // vertexStart += sourceBaseVertexCount + sourceSmoothVertexCount; multiDrawIndirect
+            // adds vertexStart to each vertexId for the draw. The index buffer I have is a fully
+            // consolidated index buffer, so I don't need to use that.
+        }
+        renderItem.setSourceIndexMapping(*_geometryIndexMapping.get());
+    }
+
+    MProfilingScope subsubProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:triangulateSmoothPatchTable");
+
+    // Inspired by HdSt_Osd3IndexComputation::Resolve()
+    // PxOsdOpenSubdivTokens->loop -> _patchTable is triangles
+    // PxOsdOpenSubdivTokens->catmullClark + _adaptive -> BSplinePatches
+    // PxOsdOpenSubdivTokens->catmullClark + !_adaptive -> quads
+    // HdSt draws with tessellation shaders and we do have that? try calling
+    OpenSubdiv::Far::Index const* firstIndex = nullptr;
+    size_t                        ptableSize = 0;
+    if (_patchTable) {
+        ptableSize = _patchTable->GetPatchControlVerticesTable().size();
+        if (ptableSize > 0)
+            firstIndex = &_patchTable->GetPatchControlVerticesTable()[0];
+    }
+
+    int indexLength = 0;
+
+    if (!_adaptive
+        && _meshSharedData->_renderingTopology.GetScheme() == PxOsdOpenSubdivTokens->catmullClark) {
+        // _patchTable is quads. Convert to triangles and make an index buffer we can draw
+        int patchSize
+            = _patchTable ? _patchTable->GetPatchArrayDescriptor(0).GetNumControlVertices() : 0;
+        TF_VERIFY(patchSize == 4);
+        VtArray<int> indices(ptableSize);
+        memcpy(indices.data(), firstIndex, ptableSize * sizeof(int));
+
+        {
+            MProfilingScope subsubProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory,
+                MProfiler::kColorD_L1,
+                "MeshViewportCompute:createTriangleIndexBuffer");
+
+            MIndexBuffer* indexBuffer = fRenderGeom->indexBuffer(0);
+            if (!indexBuffer) {
+                indexBuffer = fRenderGeom->createIndexBuffer(MGeometry::kInt32);
+            }
+            // The new size of the index buffer needs to be 50% larger than the patch table size
+            // when the patch table is quads.
+            void* indexData = indexBuffer->acquire(
+                ptableSize * 1.5,
+                true); // we are not going to use any of the old indexing so this is write only
+            int* indexWriteLocation = (int*)indexData;
+            for (size_t faceStart = 0; faceStart < ptableSize; faceStart += 4) {
+                *(indexWriteLocation++) = indices[faceStart];
+                *(indexWriteLocation++) = indices[faceStart + 1];
+                *(indexWriteLocation++) = indices[faceStart + 2];
+
+                *(indexWriteLocation++) = indices[faceStart];
+                *(indexWriteLocation++) = indices[faceStart + 2];
+                *(indexWriteLocation++) = indices[faceStart + 3];
+            }
+            indexBuffer->commit(indexData);
+            indexLength = ptableSize;
+        }
+    } else if (_meshSharedData->_renderingTopology.GetScheme() == PxOsdOpenSubdivTokens->loop) {
+        int patchSize
+            = _patchTable ? _patchTable->GetPatchArrayDescriptor(0).GetNumControlVertices() : 0;
+        TF_VERIFY(patchSize == 3);
+        MIndexBuffer* indexBuffer = fRenderGeom->indexBuffer(0);
+        void*         indexData = indexBuffer->acquire(ptableSize, true);
+        memcpy(indexData, firstIndex, ptableSize * sizeof(int));
+        indexBuffer->commit(indexData);
+    } else {
+        // I can't handle it. Need to use patch drawing with
+        // MRenderItem::setPrimitive(MGeometry::kPatch, stride)
+        // But I don't have a shader set up with Tessellation.
+    }
+#endif
+}
+
+void MeshViewportCompute::findVertexBuffers(MRenderItem& renderItem)
+{
+    if (_positionVertexBufferGPU) {
+        return;
+    }
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:findVertexBuffers");
+
+    for (int bufferIndex = 0; bufferIndex < fRenderGeom->vertexBufferCount(); bufferIndex++) {
+        MVertexBuffer* renderBuffer = fRenderGeom->vertexBuffer(bufferIndex);
+        TF_VERIFY(renderBuffer->resourceHandle());
+        const MVertexBufferDescriptor& descriptor = renderBuffer->descriptor();
+
+        if (MGeometry::kPosition == descriptor.semantic()) {
+            MProfilingScope subsubProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory,
+                MProfiler::kColorD_L2,
+                "MeshViewportCompute:positionBufferResourceHandle");
+            TF_VERIFY(renderBuffer->vertexCount() == _vertexCount);
+            _positionVertexBufferGPU = renderBuffer;
+        } else if (MGeometry::kNormal == descriptor.semantic()) {
+            MProfilingScope subsubProfilingScope(
+                HdVP2RenderDelegate::sProfilerCategory,
+                MProfiler::kColorD_L2,
+                "MeshViewportCompute:normalBufferResourceHandle");
+            _normalVertexBufferGPU = renderBuffer;
+        } else if (MGeometry::kColor == descriptor.semantic()) {
+            _colorVertexBufferGPU = renderBuffer;
+        } else {
+            fprintf(stderr, "Unsupported buffer type.\n");
+            continue;
+        }
+    }
+
+    if (nullptr == _normalVertexBufferGPU) {
+        MProfilingScope subsubProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L1,
+            "MeshViewportCompute:createNormalBuffer");
+
+        const MHWRender::MVertexBufferDescriptor vbDesc(
+            "", MHWRender::MGeometry::kNormal, MHWRender::MGeometry::kFloat, 3);
+
+        _normalVertexBufferGPU = fRenderGeom->createVertexBuffer(vbDesc);
+    }
+
+    GLuint* normalBufferResourceHandle = (GLuint*)_normalVertexBufferGPU->resourceHandle();
+    if (!normalBufferResourceHandle) {
+        // tell the buffer what size it is
+        void* normalsBufferData = _normalVertexBufferGPU->acquire(_vertexCount, true);
+        memset(normalsBufferData, 0, _vertexCount * sizeof(float) * 3);
+        _normalVertexBufferGPU->commit(normalsBufferData);
+    }
+}
+
+void MeshViewportCompute::prepareAdjacencyBuffer()
+{
+#if defined(HDVP2_OPENGL_NORMALS)
+    if (!_adjacencyBufferGPUDirty)
+        return;
+    _adjacencyBufferGPUDirty = false;
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:prepareAdjacencyBuffer");
+
+    // we compute the number of normals as required by the topology.
+
+    // https://www.khronos.org/assets/uploads/developers/library/2014-siggraph-bof/KITE-BOF_Aug14.pdf
+    // https://github.com/PixarAnimationStudios/USD/blob/be1a80f8cb91133ac75e1fc2a2e1832cd10d91c8/pxr/imaging/hdSt/smoothNormals.cpp
+
+    // We need additional padding in the header of adjacencyData because glsl compute seems to still
+    // execute all branch not taken code, and just not write anything. If we don't have padding then
+    // some gpu threads will see a valance in the thousands and take forever to run and/or crash
+    // accessing memory out of bounds.
+
+    const int* adjacencyData = _adjacencyBufferCPU.get();
+    int        numVertex = adjacencyData[0] / 2; // two entries per vertex.
+    size_t     localWorkSize = 256;
+    size_t     paddingSize = (localWorkSize - numVertex % localWorkSize) * 2;
+
+    size_t adjacencyBufferSize = _adjacencyBufferSize + paddingSize;
+    size_t vertexDataSize = (numVertex * 2);
+    int*   vertexDataStart = const_cast<int*>(adjacencyData);
+    size_t vertexIdSize = _adjacencyBufferSize - vertexDataSize;
+    int*   vertexIdStart = vertexDataStart + vertexDataSize;
+
+    const MHWRender::MVertexBufferDescriptor intArrayDesc(
+        "", MHWRender::MGeometry::kColor, MHWRender::MGeometry::kInt32, 1);
+    _adjacencyBufferGPU.reset(new MHWRender::MVertexBuffer(intArrayDesc));
+    void* bufferData = _adjacencyBufferGPU->acquire(adjacencyBufferSize, true);
+
+    for (int i = 0; i < vertexDataSize; i += 2) {
+        vertexDataStart[i] += paddingSize;
+    }
+
+    // copy the vertex data information into the new padded buffer
+    int* destination = ((int*)bufferData);
+    memcpy(bufferData, vertexDataStart, vertexDataSize * sizeof(int));
+    // set the padding space to be zeros
+    destination = destination + vertexDataSize;
+    memset(destination, 0, paddingSize * sizeof(int));
+    // copy the adjacency information for each vertex into the upper part of the buffer.
+    destination = destination + paddingSize;
+    memcpy(destination, vertexIdStart, vertexIdSize * sizeof(int));
+
+    // commit the adjacency information
+    _adjacencyBufferGPU->commit(bufferData);
+
+    // prepare the remapping array from the regular topology to the render topology
+    _renderingToSceneFaceVtxIdsGPU.reset(new MHWRender::MVertexBuffer(intArrayDesc));
+    bufferData = _renderingToSceneFaceVtxIdsGPU->acquire(
+        _meshSharedData->_renderingToSceneFaceVtxIds.size(), true);
+    memcpy(
+        bufferData,
+        _meshSharedData->_renderingToSceneFaceVtxIds.data(),
+        _meshSharedData->_renderingToSceneFaceVtxIds.size() * sizeof(int));
+    _renderingToSceneFaceVtxIdsGPU->commit(bufferData);
+
+    _sceneToRenderingFaceVtxIdsGPU.reset(new MHWRender::MVertexBuffer(intArrayDesc));
+    bufferData = _sceneToRenderingFaceVtxIdsGPU->acquire(
+        _meshSharedData->_sceneToRenderingFaceVtxIds.size(), true);
+    memcpy(
+        bufferData,
+        _meshSharedData->_sceneToRenderingFaceVtxIds.data(),
+        _meshSharedData->_sceneToRenderingFaceVtxIds.size() * sizeof(int));
+    _sceneToRenderingFaceVtxIdsGPU->commit(bufferData);
+#endif
+}
+
+void MeshViewportCompute::prepareUniformBufferForNormals()
+{
+    if (0 != _uboResourceHandle)
+        return;
+
+    if (hasOpenGL()) {
+        glGenBuffers(1, &_uboResourceHandle);
+        glBindBuffer(GL_UNIFORM_BUFFER, _uboResourceHandle);
+        glBufferData(GL_UNIFORM_BUFFER, sizeof(unsigned int), &_vertexCount, GL_STATIC_DRAW);
+        glBindBuffer(GL_UNIFORM_BUFFER, 0);
+    }
+}
+
+bool MeshViewportCompute::hasOpenGL()
+{
+    // test an arbitrary OpenGL function pointer and make sure it is not nullptr
+    return nullptr != glBindBufferBase;
+}
+
+void MeshViewportCompute::initializeOpenGL() { glewInit(); }
+
+void MeshViewportCompute::compileNormalsProgram()
+{
+#if defined(HDVP2_OPENGL_NORMALS)
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:compileNormalsProgram");
+
+    std::string   computeShaderSource = _GetResourcePath("computeNormals.glsl");
+    std::ifstream glslFile(computeShaderSource.c_str());
+    std::string   glslString;
+    glslFile.seekg(0, std::ios::end);
+    glslString.reserve(glslFile.tellg());
+    glslFile.seekg(0, std::ios::beg);
+
+    glslString.assign((std::istreambuf_iterator<char>(glslFile)), std::istreambuf_iterator<char>());
+
+    if (hasOpenGL()) {
+        initializeOpenGL();
+    }
+    TF_VERIFY(hasOpenGL());
+
+    _computeNormalsProgram = new PxrMayaGLSLProgram;
+    _computeNormalsProgram->CompileShader(GL_COMPUTE_SHADER, glslString);
+    _computeNormalsProgram->Link();
+    _computeNormalsProgram->Validate();
+    openGLErrorCheck();
+#endif
+}
+
+void MeshViewportCompute::computeNormals()
+{
+#if defined(HDVP2_OPENGL_NORMALS)
+
+    if (!_normalVertexBufferGPUDirty)
+        return;
+    _normalVertexBufferGPUDirty = false;
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:computeNormals");
+    GLuint* adjacencyBufferResourceHandle = (GLuint*)_adjacencyBufferGPU->resourceHandle();
+
+    std::call_once(_compileProgramOnce, MeshViewportCompute::compileNormalsProgram);
+
+    GLuint programId = _computeNormalsProgram->GetProgramId();
+
+    // We already did another lambda task that did the commit for _positionsBuffer, so we should be
+    // able to get the resource handle.
+    GLuint* positionBufferResourceHandle = (GLuint*)_positionVertexBufferGPU->resourceHandle();
+
+    // normal buffer needs to be locked because we are modifying it. We don't want the CPU version
+    // and GPU version of the buffer to hold different data. Locking the buffer deletes the CPU
+    // version of the buffer.
+    _normalVertexBufferGPU->lockResourceHandle();
+    GLuint* normalBufferResourceHandle = (GLuint*)_normalVertexBufferGPU->resourceHandle();
+
+    // remapping buffers
+    GLuint* renderingToSceneFaceVtxIdsResourceHandle
+        = (GLuint*)_renderingToSceneFaceVtxIdsGPU->resourceHandle();
+    GLuint* sceneToRenderingFaceVtxIdsResourceHandle
+        = (GLuint*)_sceneToRenderingFaceVtxIdsGPU->resourceHandle();
+
+    if (hasOpenGL()) {
+        glBindBufferBase(GL_UNIFORM_BUFFER, 0, _uboResourceHandle);
+        openGLErrorCheck();
+
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, *positionBufferResourceHandle);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, *adjacencyBufferResourceHandle);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, *renderingToSceneFaceVtxIdsResourceHandle);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, *sceneToRenderingFaceVtxIdsResourceHandle);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, *normalBufferResourceHandle);
+        openGLErrorCheck();
+
+        size_t localWorkSize = 256;
+        size_t globalWorkSize = (localWorkSize - _vertexCount % localWorkSize) + _vertexCount;
+        size_t num_groups = globalWorkSize / localWorkSize;
+
+        glUseProgram(programId);
+        glDispatchCompute(num_groups, 1, 1);
+        glUseProgram(0);
+        openGLErrorCheck();
+
+        glBindBufferBase(GL_UNIFORM_BUFFER, 0, 0);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, 0);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, 0);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, 0);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, 0);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, 0);
+        openGLErrorCheck();
+    }
+    _normalVertexBufferGPU->unlockResourceHandle();
+#elif defined(HDVP2_OPENCL_NORMALS)
+
+    // Make shared buffers
+    std::vector<cl_mem> sharedBuffers;
+    cl_int              err;
+
+    {
+        MProfilingScope subProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L2,
+            "MeshViewportCompute:copyAdjacencyToOpenCL");
+        for (unsigned int i = 0; i < consolidatedItems.size(); i++) {
+            _renderItemData& _renderItemData = _renderItemDatas[i];
+            MRenderItem*     renderItem = consolidatedItems[i].get();
+
+            {
+                _meshSharedData->_openCLMeshInfo._adjacencyBufferCL.reset(new MOpenCLBuffer(
+                    MOpenCLInfo::getOpenCLContext(),
+                    CL_MEM_READ_ONLY, // the READ and WRITE flags are from the point of view of an
+                                      // OpenCL Kernel.
+                    _adjacencyBufferSize * sizeof(int),
+                    (void*)_adjacencyBufferCPU,
+                    &err));
+                MOpenCLInfo::checkCLErrorStatus(err);
+            }
+        }
+    }
+
+    {
+        MProfilingScope subProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L2,
+            "MeshViewportCompute:attachToGLBuffers");
+        for (unsigned int i = 0; i < consolidatedItems.size(); i++) {
+            _renderItemData& _renderItemData = _renderItemDatas[i];
+            MRenderItem*     renderItem = consolidatedItems[i].get();
+            {
+
+                _meshSharedData->_openCLMeshInfo._positionsBufferShared.attach(clCreateFromGLBuffer(
+                    MOpenCLInfo::getOpenCLContext(),
+                    CL_MEM_READ_ONLY,
+                    *(cl_GLuint*)_positionVertexBufferGPU->resourceHandle(),
+                    &err));
+                MOpenCLInfo::checkCLErrorStatus(err);
+                sharedBuffers.push_back(
+                    _meshSharedData->_openCLMeshInfo._positionsBufferShared.get());
+
+                _meshSharedData->_openCLMeshInfo._normalsBufferShared.attach(clCreateFromGLBuffer(
+                    MOpenCLInfo::getOpenCLContext(),
+                    CL_MEM_WRITE_ONLY,
+                    *(cl_GLuint*)_normalVertexBufferGPU->resourceHandle(),
+                    &err)); // from the point of view of OpenCL!!!
+                MOpenCLInfo::checkCLErrorStatus(err);
+                sharedBuffers.push_back(
+                    _meshSharedData->_openCLMeshInfo._normalsBufferShared.get());
+            }
+        }
+    }
+
+    // acquire the shared buffers
+    MAutoCLEvent acquireEvent;
+    {
+        MProfilingScope subProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L2,
+            "MeshViewportCompute:acquireSharedBuffers");
+#ifdef MAYA_BLOCKING_OPENCL
+        err = clEnqueueAcquire3DObjects_blocking_api(
+            MOpenCLInfo::getMayaDefaultOpenCLCommandQueue(),
+            sharedBuffers.size(),
+            sharedBuffers.data(),
+            0,
+            nullptr,
+            acquireEvent.getReferenceForAssignment());
+#else
+        err = clEnqueueAcquireGLObjects(
+            MOpenCLInfo::getMayaDefaultOpenCLCommandQueue(),
+            sharedBuffers.size(),
+            sharedBuffers.data(),
+            0,
+            nullptr,
+            acquireEvent.getReferenceForAssignment());
+#endif
+    }
+    MOpenCLInfo::checkCLErrorStatus(err);
+
+    // get the kernel
+    MString kernelFile("C:/dev/usd/ecg-maya-usd/maya-usd/lib/mayaUsd/render/vp2RenderDelegate/"
+                       "mesh.cl"); // needs to get installed somewhere
+    MString kernelName("computeNormals");
+    MAutoCLKernel computeNormalsKernel(MOpenCLInfo::getOpenCLKernel(kernelFile, kernelName));
+
+    // compute the work group size and global work size
+    size_t workGroupSize;
+    size_t retSize;
+    err = clGetKernelWorkGroupInfo(
+        computeNormalsKernel.get(),
+        MOpenCLInfo::getOpenCLDeviceId(),
+        CL_KERNEL_WORK_GROUP_SIZE,
+        sizeof(size_t),
+        &workGroupSize,
+        &retSize);
+    MOpenCLInfo::checkCLErrorStatus(err);
+    size_t localWorkSize = 256;
+    if (retSize > 0) {
+        localWorkSize = workGroupSize;
+    }
+
+    cl_event* events = new cl_event[consolidatedItems.size()];
+    {
+        MProfilingScope subProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L2,
+            "MeshViewportCompute:enqueueKernels");
+        for (unsigned int i = 0; i < consolidatedItems.size(); i++) {
+            _renderItemData& _renderItemData = _renderItemDatas[i];
+            MRenderItem*     renderItem = consolidatedItems[i].get();
+
+            size_t remain = _vertexCount % localWorkSize;
+            size_t globalWorkSize = _vertexCount;
+            if (remain) {
+                globalWorkSize = _vertexCount + localWorkSize - remain;
+            }
+
+            // set up the compute kernel!  This can be mostly copied from
+            // CommitOpenCLPositions::operator()(), just integrate the new data handles we got
+            // above.
+
+            // set kernel args
+            err = clSetKernelArg(
+                computeNormalsKernel.get(),
+                0,
+                sizeof(cl_mem),
+                (const void*)
+                    _meshSharedData->_openCLMeshInfo._positionsBufferShared.getReadOnlyRef());
+            MOpenCLInfo::checkCLErrorStatus(err);
+            err = clSetKernelArg(
+                computeNormalsKernel.get(), 1, sizeof(cl_uint), (const void*)&_vertexCount);
+            MOpenCLInfo::checkCLErrorStatus(err);
+            err = clSetKernelArg(
+                computeNormalsKernel.get(),
+                2,
+                sizeof(cl_mem),
+                (const void*)_meshSharedData->_openCLMeshInfo._adjacencyBufferCL->buffer()
+                    .getReadOnlyRef());
+            MOpenCLInfo::checkCLErrorStatus(err);
+            err = clSetKernelArg(
+                computeNormalsKernel.get(),
+                3,
+                sizeof(cl_mem),
+                (const void*)
+                    _meshSharedData->_openCLMeshInfo._normalsBufferShared.getReadOnlyRef());
+            MOpenCLInfo::checkCLErrorStatus(err);
+
+            // run kernel
+            err = clEnqueueNDRangeKernel(
+                MOpenCLInfo::getMayaDefaultOpenCLCommandQueue(),
+                computeNormalsKernel.get(),
+                1,
+                nullptr,
+                &globalWorkSize,
+                &localWorkSize,
+                1,
+                acquireEvent.getReadOnlyRef(),
+                _meshSharedData->_openCLMeshInfo._normalsBufferReady.getReferenceForAssignment());
+            MOpenCLInfo::checkCLErrorStatus(err);
+
+            // build a list of all the kernel complete events
+            events[i] = _meshSharedData->_openCLMeshInfo._normalsBufferReady.get();
+        }
+    }
+
+    // release the shared buffers
+    MAutoCLEvent releaseEvent;
+    {
+        MProfilingScope subProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L2,
+            "MeshViewportCompute:releaseSharedBuffers");
+#ifdef MAYA_BLOCKING_OPENCL
+        clEnqueueRelease3DObjects_blocking_api(
+            MOpenCLInfo::getMayaDefaultOpenCLCommandQueue(),
+            sharedBuffers.size(),
+            sharedBuffers.data(),
+            1,
+            _meshSharedData->_openCLMeshInfo._normalsBufferReady.getReadOnlyRef(),
+            releaseEvent.getReferenceForAssignment());
+#else
+        clEnqueueReleaseGLObjects(
+            MOpenCLInfo::getMayaDefaultOpenCLCommandQueue(),
+            sharedBuffers.size(),
+            sharedBuffers.data(),
+            consolidatedItems.size(),
+            events,
+            releaseEvent.getReferenceForAssignment());
+#endif
+    }
+    {
+        MProfilingScope subProfilingScope(
+            HdVP2RenderDelegate::sProfilerCategory,
+            MProfiler::kColorD_L3,
+            "MeshViewportCompute:syncOpenCL");
+        DoGLWaitSync(releaseEvent.get());
+    }
+    delete[] events;
+
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L2,
+        "MeshViewportCompute:releaseOpenCLBuffers");
+
+    for (unsigned int i = 0; i < consolidatedItems.size(); i++) {
+        _renderItemData& _renderItemData = _renderItemDatas[i];
+        MRenderItem*     renderItem = consolidatedItems[i].get();
+
+        _meshSharedData->_openCLMeshInfo._positionsBufferShared.reset();
+        _meshSharedData->_openCLMeshInfo._normalsBufferShared.reset();
+        _meshSharedData->_openCLMeshInfo._adjacencyBufferCL->reset();
+        _meshSharedData->_openCLMeshInfo._normalsBufferReady.reset();
+    }
+
+    // clFinish(MOpenCLInfo::getMayaDefaultOpenCLCommandQueue());
+
+#elif defined(DO_CPU_NORMALS)
+
+#endif
+}
+
+void MeshViewportCompute::computeOSD()
+{
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+    MProfilingScope subProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory, MProfiler::kColorD_L2, "MeshViewportCompute:doOSD");
+    // Inspired by HdSt_Osd3TopologyComputation::Resolve()
+
+    // refine
+    //  and
+    // create stencil/patch table
+    OpenSubdiv::Far::StencilTable const* consolidatedVertexStencils = _vertexStencils.get();
+    OpenSubdiv::Far::StencilTable const* consolidatedVaryingStencils = _varyingStencils.get();
+    OpenSubdiv::Far::PatchTable const*   consolidatedPatchTable = _patchTable.get();
+#endif
+#if defined(DO_CPU_OSD)
+
+    class OsdCPUBuffer
+    {
+    public:
+        OsdCPUBuffer(float* buffer)
+            : _buffer(buffer)
+        {
+        }
+
+        float* BindCpuBuffer() { return _buffer; }
+
+    private:
+        float* _buffer { nullptr };
+    };
+
+    // smooth the normals
+    const MVertexBufferDescriptor& normalsDescriptor = _normalVertexBufferGPU->descriptor();
+    int                            dimension = normalsDescriptor.dimension();
+    void*                          normalsBufferData = _normalVertexBufferGPU->acquire(
+        _vertexCount + consolidatedVertexStencils->GetNumStencils(), false);
+    OpenSubdiv::Osd::BufferDescriptor normalSrcDesc(0, dimension, dimension);
+    OpenSubdiv::Osd::BufferDescriptor normalDstDesc(_vertexCount * dimension, dimension, dimension);
+    OsdCPUBuffer                      osdNormalVertexBuffer((float*)normalsBufferData);
+    OpenSubdiv::Osd::CpuEvaluator::
+        EvalStencils<OsdCPUBuffer, OsdCPUBuffer, OpenSubdiv::Far::StencilTable>(
+            &osdNormalVertexBuffer,
+            normalSrcDesc,
+            &osdNormalVertexBuffer,
+            normalDstDesc,
+            consolidatedVertexStencils);
+    _normalVertexBufferGPU->commit(normalsBufferData);
+
+    // smooth the positions
+    const MVertexBufferDescriptor& positionsDescriptor = _positionVertexBufferGPU->descriptor();
+    dimension = positionsDescriptor.dimension();
+    void* positionsBufferData = _positionVertexBufferGPU->acquire(
+        _vertexCount + consolidatedVertexStencils->GetNumStencils(), false);
+    OpenSubdiv::Osd::BufferDescriptor positionSrcDesc(0, dimension, dimension);
+    OpenSubdiv::Osd::BufferDescriptor positionDstDesc(
+        _vertexCount * dimension, dimension, dimension);
+    OsdCPUBuffer osdPositionVertexBuffer((float*)positionsBufferData);
+    OpenSubdiv::Osd::CpuEvaluator::
+        EvalStencils<OsdCPUBuffer, OsdCPUBuffer, OpenSubdiv::Far::StencilTable>(
+            &osdPositionVertexBuffer,
+            positionSrcDesc,
+            &osdPositionVertexBuffer,
+            positionDstDesc,
+            consolidatedVertexStencils);
+    _positionVertexBufferGPU->commit(positionsBufferData);
+#elif defined(DO_OPENGL_OSD)
+
+    class OsdGLBuffer
+    {
+    public:
+        OsdGLBuffer(GLuint resourceId, size_t dimension)
+            : _dimension(dimension)
+            , _resourceId(resourceId)
+        {
+        }
+
+        size_t GetNumElements() const { return _dimension; }
+        GLuint BindVBO() { return _resourceId; }
+
+    private:
+        size_t _dimension { 0 };
+        GLuint _resourceId;
+    };
+
+    // Smooth the positions
+    // ensure the position buffer is large enough to hold the smoothed result
+    const MVertexBufferDescriptor& positionsDescriptor = _positionVertexBufferGPU->descriptor();
+    int                            dimension = positionsDescriptor.dimension();
+    void*                          positionsBufferData = _positionVertexBufferGPU->acquire(
+        _vertexCount + consolidatedVertexStencils->GetNumStencils(), false);
+    _positionVertexBufferGPU->commit(positionsBufferData);
+
+    // BufferDescriptor is meant to describe an interleaved buffer, but we are not interleaved, so
+    // it looks dumb
+    OpenSubdiv::Osd::BufferDescriptor positionSrcDesc(0, dimension, dimension);
+    OpenSubdiv::Osd::BufferDescriptor positionDstDesc(
+        _vertexCount * dimension, dimension, dimension);
+
+    static OpenSubdiv::Osd::EvaluatorCacheT<OpenSubdiv::Osd::GLComputeEvaluator> evaluatorCache;
+    OpenSubdiv::Osd::GLComputeEvaluator const*                                   positionInstance
+        = OpenSubdiv::Osd::GetEvaluator<OpenSubdiv::Osd::GLComputeEvaluator>(
+            &evaluatorCache, positionSrcDesc, positionDstDesc, (void*)NULL);
+
+    OsdGLBuffer osdPositionBuffer(
+        *((GLuint*)(_positionVertexBufferGPU->resourceHandle())), dimension);
+    OpenSubdiv::Osd::GLStencilTableSSBO* gpuStencilTable
+        = OpenSubdiv::Osd::GLStencilTableSSBO::Create(consolidatedVertexStencils, nullptr);
+    positionInstance->EvalStencils(
+        &osdPositionBuffer, positionSrcDesc, &osdPositionBuffer, positionDstDesc, gpuStencilTable);
+
+    // ensure the normal buffer is large enough to hold the smoothed result
+    const MVertexBufferDescriptor& normalsDescriptor = _normalVertexBufferGPU->descriptor();
+    dimension = normalsDescriptor.dimension();
+    void* normalsBufferData = _normalVertexBufferGPU->acquire(
+        _vertexCount + consolidatedVertexStencils->GetNumStencils(), false);
+    _normalVertexBufferGPU->commit(normalsBufferData);
+
+    // BufferDescriptor is meant to describe an interleaved buffer, but we are not interleaved, so
+    // it looks dumb
+    OpenSubdiv::Osd::BufferDescriptor normalSrcDesc(0, dimension, dimension);
+    OpenSubdiv::Osd::BufferDescriptor normalDstDesc(_vertexCount * dimension, dimension, dimension);
+
+    OpenSubdiv::Osd::GLComputeEvaluator const* normalInstance
+        = OpenSubdiv::Osd::GetEvaluator<OpenSubdiv::Osd::GLComputeEvaluator>(
+            &evaluatorCache, normalSrcDesc, normalDstDesc, (void*)NULL);
+
+    OsdGLBuffer osdNormalBuffer(*((GLuint*)(_normalVertexBufferGPU->resourceHandle())), dimension);
+    normalInstance->EvalStencils(
+        &osdNormalBuffer, normalSrcDesc, &osdNormalBuffer, normalDstDesc, gpuStencilTable);
+
+    if (_colorVertexBufferGPU) {
+        // ensure the color buffer is large enough to hold the smoothed result
+        const MVertexBufferDescriptor& colorsDescriptor = _colorVertexBufferGPU->descriptor();
+        dimension = colorsDescriptor.dimension();
+        void* ColorsBufferData = _colorVertexBufferGPU->acquire(
+            _vertexCount + consolidatedVertexStencils->GetNumStencils(), false);
+        _colorVertexBufferGPU->commit(ColorsBufferData);
+
+        // BufferDescriptor is meant to describe an interleaved buffer, but we are not interleaved,
+        // so it looks dumb
+        OpenSubdiv::Osd::BufferDescriptor ColorSrcDesc(0, dimension, dimension);
+        OpenSubdiv::Osd::BufferDescriptor ColorDstDesc(
+            _vertexCount * dimension, dimension, dimension);
+
+        OpenSubdiv::Osd::GLComputeEvaluator const* colorInstance
+            = OpenSubdiv::Osd::GetEvaluator<OpenSubdiv::Osd::GLComputeEvaluator>(
+                &evaluatorCache, ColorSrcDesc, ColorDstDesc, (void*)NULL);
+
+        OsdGLBuffer osdColorBuffer(
+            *((GLuint*)(_colorVertexBufferGPU->resourceHandle())), dimension);
+        colorInstance->EvalStencils(
+            &osdColorBuffer, ColorSrcDesc, &osdColorBuffer, ColorDstDesc, gpuStencilTable);
+    }
+#endif
+}
+
+void MeshViewportCompute::setClean()
+{
+    // when we perform consolidated compute the dirty flags for the source items remain dirty.
+    // Each individual source item isn't capable of drawing unconsolidated, so the flags must
+    // remain dirty to guard against potentially drawing unconsolidated & requiring the compute
+    // to execute.
+
+    _topologyDirty = false;
+    _adjacencyBufferGPUDirty = false;
+    _normalVertexBufferGPUDirty = false;
+    _executed = true;
+}
+
+bool MeshViewportCompute::execute(
+    const MPxViewportComputeItem::Actions& availableActions,
+    MRenderItem&                           renderItem)
+{
+    if (!_normalVertexBufferGPUDirty)
+        return true;
+
+    if (_adjacencyTaskInProgress)
+        return false;
+
+    MProfilingScope mainProfilingScope(
+        HdVP2RenderDelegate::sProfilerCategory,
+        MProfiler::kColorD_L1,
+        "MeshViewportCompute::execute");
+
+    findConsolidationMapping(renderItem);
+
+    if (_topologyDirty || _adjacencyBufferSize == 0) {
+        TF_VERIFY(!_adjacencyTaskInProgress);
+        _adjacencyTaskInProgress = true;
+        EnqueueLambdaTask([this] {
+            this->createConsolidatedTopology(getSceneTopology);
+            this->createConsolidatedTopology(getRenderingTopology);
+            this->createConsolidatedAdjacency();
+            this->_adjacencyTaskInProgress = false;
+        });
+        return false;
+    }
+
+    findRenderGeometry(renderItem);
+
+    createConsolidatedOSDTables(renderItem); // disabled by preprocessor macros
+
+    findVertexBuffers(renderItem);
+
+    prepareAdjacencyBuffer();
+
+    prepareUniformBufferForNormals();
+
+    computeNormals();
+
+    computeOSD(); // disabled by preprocessor macros
+
+    setClean();
+
+    return true;
+}
+
+bool MeshViewportCompute::canConsolidate(const MPxViewportComputeItem& other) const
+{
+    const MeshViewportCompute* otherMeshViewportCompute
+        = dynamic_cast<const MeshViewportCompute*>(&other);
+    if (nullptr == otherMeshViewportCompute)
+        return false;
+
+    // If the compute has executed then the data to be consolidated will already
+    // be smoothed. Smoothed items can only consolidate with other smoothed items.
+    return hasExecuted() == otherMeshViewportCompute->hasExecuted()
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+        && _adaptive == otherMeshViewportCompute->_adaptive
+        && _level == otherMeshViewportCompute->_level
+        && _meshSharedData->_renderingTopology.GetScheme()
+        == otherMeshViewportCompute->_meshSharedData->_renderingTopology.GetScheme()
+#endif
+        ;
+}
+
+MSharedPtr<MPxViewportComputeItem> MeshViewportCompute::cloneForConsolidation() const
+{
+    MSharedPtr<MeshViewportCompute> clone
+        = MSharedPtr<MeshViewportCompute>::make<>(std::make_shared<HdVP2MeshSharedData>(), nullptr);
+    return clone;
+}
+
+PXR_NAMESPACE_CLOSE_SCOPE
+
+#endif
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
new file mode 100644
index 0000000000..47119ca98b
--- /dev/null
+++ b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
@@ -0,0 +1,261 @@
+//
+// Copyright 2020 Autodesk
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef HD_VP2_MESHVIEWPORTCOMPUTE
+#define HD_VP2_MESHVIEWPORTCOMPUTE
+
+#include <maya/MTypes.h> // for MAYA_API_VERSION
+
+/*
+    GPU Compute Prototype
+
+    The GPU evaluation prototype is two separate parts, the normal calculation code and the OSD
+    code.
+
+    The normal calculation code is enabled by setting HDVP2_USE_GPU_NORMAL_COMPUTATION=1 at runtime.
+    The normal calculation code is close to being stable enough for general use, but hasn't had
+    enough polish to enable by default.
+
+    The OSD code requires the normal calculation code to be enabled to use. OSD is enabled
+    by compiling with HDVP2_ENABLE_GPU_OSD. The OSD code is much less stable then the normals
+    calculation code and comes with a number of huge
+
+    OSD Limitations:
+     * No OSD adaptive support
+     * scenes with animation behave poorly
+     * selection in the viewport is very slow
+     * toggling VP2 consolidation world off and on will cause crashes
+     * some objects draw with incorrect indexing
+*/
+
+#if MAYA_API_VERSION >= 20210000 \
+    && !defined(OSMac_) // OSX doesn't have OpenGL 4.3 support necessary for compute
+#define HDVP2_ENABLE_GPU_COMPUTE
+#endif
+
+#ifdef HDVP2_ENABLE_GPU_COMPUTE
+/*
+    GPU OSD computation implementation is experimental.
+*/
+//#define HDVP2_ENABLE_GPU_OSD
+#ifdef HDVP2_ENABLE_GPU_OSD
+#define DO_OPENGL_OSD
+//#define DO_CPU_OSD
+#endif
+
+#include <pxr/imaging/hd/mesh.h>
+#include <pxr/pxr.h>
+
+#include <maya/MHWGeometry.h>
+#include <maya/MSharedPtr.h>
+
+#ifdef HDVP2_ENABLE_GPU_OSD
+#include <opensubdiv/far/_patchTable.h>
+#include <opensubdiv/far/stencilTable.h>
+#endif
+
+#include <mayaUsd/render/vp2RenderDelegate/proxyRenderDelegate.h>
+
+#include <climits>
+
+#define HDVP2_OPENGL_NORMALS
+/*
+    OpenCL Normals calculation is experimental
+*/
+//#define HDVP2_OPENCL_NORMALS
+#ifdef HDVP2_OPENCL_NORMALS
+#include <clew/clew.h>
+#endif
+#ifdef HDVP2_OPENGL_NORMALS
+//#define DO_OPENGL_ERROR_CHECK
+// clang-format wants to re-order these two includes but they must be done in this order or
+// the code will not compile.
+// clang-format off
+#include "pxr/imaging/glf/glew.h"   // needs to be included before anything else includes gl.h
+#include "../px_vp20/glslProgram.h" // this includes gl.h and not glew.
+//clang-format on
+#endif
+
+#include <maya/MPxViewportComputeItem.h>
+
+#ifdef HDVP2_OPENCL_NORMALS
+#include <maya/MOpenCLAutoPtr.h>
+#include <maya/MOpenCLBuffer.h>
+#endif
+
+#include <pxr/base/plug/plugin.h>
+#include <pxr/base/plug/thisPlugin.h>
+#include <pxr/base/tf/diagnostic.h>
+#include <pxr/base/tf/getenv.h>
+
+#ifdef HDVP2_ENABLE_GPU_OSD
+#include <pxr/imaging/pxOsd/refinerFactory.h>
+#include <pxr/imaging/pxOsd/tokens.h>
+
+#include <opensubdiv/far/_patchTable.h>
+#include <opensubdiv/far/patchTableFactory.h>
+#include <opensubdiv/far/stencilTable.h>
+#include <opensubdiv/far/stencilTableFactory.h>
+#include <opensubdiv/osd/cpuEvaluator.h>
+#include <opensubdiv/osd/cpuVertexBuffer.h>
+#include <opensubdiv/osd/glComputeEvaluator.h>
+#include <opensubdiv/osd/glVertexBuffer.h>
+#include <opensubdiv/osd/mesh.h>
+#include <opensubdiv/version.h>
+#endif
+
+#include <tbb/task.h>
+
+#include <fstream>
+#include <streambuf>
+#include <string>
+
+PXR_NAMESPACE_OPEN_SCOPE
+
+struct HdVP2MeshSharedData;
+class HdVP2DrawItem;
+class PxrMayaGLSLProgram;
+
+/*! \brief  HdVP2Mesh-specific compute class for evaluating geometry streams and OSD
+    \class  MeshViewportCompute
+
+    A mesh can have shader stream requires (such as normals, tangents etc) and OSD
+    requirements. This class executes GPU compute kernels to fulfill the geometry
+    requirements of an HdVP2Mesh.
+
+    A key performance feature of MeshViewportCompute is MRenderItems from different
+    HdVP2Mesh objects which have the same compute requirements can be consolidated
+    together in consolidated world and have their compute executed in a single
+    consolidated compute kernel, rather than issuing a compute kernel per unconsolidated
+    render item.
+*/
+class MeshViewportCompute : public MPxViewportComputeItem
+{
+private:
+    std::shared_ptr<HdVP2MeshSharedData> _meshSharedData;
+    const void* _drawItem { nullptr }; // only set for a consolidation source, never dereferenced
+    bool        _executed { false };   // Has this compute been executed
+    bool        _sourcesExecuted {
+        false
+    }; // Have the source compute's executed. only valid for a consolidated compute
+
+    MSharedPtr<MeshViewportCompute> _consolidatedCompute;
+
+    std::unique_ptr<MHWRender::MGeometryIndexMapping> _geometryIndexMapping;
+    unsigned int                                      _vertexCount { 0 };
+    GLuint                                            _uboResourceHandle { 0 };
+
+    // adjacency information for normals
+    size_t                                    _adjacencyBufferSize { 0 };
+    std::unique_ptr<const int>                _adjacencyBufferCPU;
+    std::unique_ptr<MHWRender::MVertexBuffer> _adjacencyBufferGPU;
+    std::unique_ptr<MHWRender::MVertexBuffer> _renderingToSceneFaceVtxIdsGPU;
+    std::unique_ptr<MHWRender::MVertexBuffer> _sceneToRenderingFaceVtxIdsGPU;
+
+    // Geometry information
+    MGeometry* fRenderGeom { nullptr };
+    // Buffers in MGeometry.
+    MVertexBuffer* _positionVertexBufferGPU { nullptr }; // not owned by *this, owned by fRenderGeom
+    MVertexBuffer* _normalVertexBufferGPU { nullptr };   // not owned by *this, owned by fRenderGeom
+    MVertexBuffer* _colorVertexBufferGPU { nullptr };    // not owned by *this, owned by fRenderGeom
+
+    bool _adjacencyTaskInProgress { false };z
+    bool _topologyDirty { true };           // sourceMeshSharedData->_renderingTopology has changed
+    bool _adjacencyBufferGPUDirty { true }; //_adjacencyBufferGPU is dirty
+    bool _normalVertexBufferGPUDirty { true }; //_normalVertexBufferGPU is dirty
+
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+    // OSD information
+    std::unique_ptr<OpenSubdiv::Far::StencilTable const> _vertexStencils;
+    std::unique_ptr<OpenSubdiv::Far::StencilTable const> _varyingStencils;
+    std::unique_ptr<OpenSubdiv::Far::PatchTable const>   _patchTable;
+    bool                                                 _adaptive { false };
+    int                                                  _level { 1 };
+#endif
+
+#if defined(HDVP2_OPENGL_NORMALS)
+    static std::once_flag      _compileProgramOnce;
+    static PxrMayaGLSLProgram* _computeNormalsProgram;
+#endif
+
+#if defined(HDVP2_OPENCL_NORMALS)
+    //!< pure OpenCL version of the positions buffer
+    std::unique_ptr<MOpenCLBuffer> _positionsBufferCL;
+
+    //!< A shared CL-GL version of _positionsBuffer
+    MAutoCLMem _positionsBufferShared;
+
+    // data to compute smooth normals
+    std::unique_ptr<MOpenCLBuffer> _adjacencyBufferCL;
+    MAutoCLMem                     _normalsBufferShared;
+    MAutoCLEvent                   _normalsBufferReady;
+#endif
+
+    static void openGLErrorCheck();
+    static bool hasOpenGL();
+    static void initializeOpenGL();
+    bool        hasExecuted() const;
+    void        reset();
+    void        findConsolidationMapping(MRenderItem& renderItem);
+    template <typename TopologyAccessor>
+    void        createConsolidatedTopology(TopologyAccessor getTopology);
+    void        createConsolidatedAdjacency();
+    void        findRenderGeometry(MRenderItem& renderItem);
+    void        createConsolidatedOSDTables(MRenderItem& renderItem);
+    void        findVertexBuffers(MRenderItem& renderItem);
+    void        prepareAdjacencyBuffer();
+    void        prepareUniformBufferForNormals();
+    static void compileNormalsProgram();
+    void        computeNormals();
+    void        computeOSD();
+    void        setClean();
+
+public:
+    MeshViewportCompute(std::shared_ptr<HdVP2MeshSharedData> meshSharedData, const void* drawItem)
+        : MPxViewportComputeItem(false)
+        , _meshSharedData(meshSharedData)
+        , _drawItem(drawItem)
+    {
+        setRequiredAction(MPxViewportComputeItem::kAccessVirtualDevice, true);
+        setRequiredAction(MPxViewportComputeItem::kAccessConsolidation, true);
+        setRequiredAction(MPxViewportComputeItem::kModifyVertexBufferData, true);
+#if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
+        setRequiredAction(MPxViewportComputeItem::kModifyVertexBufferSize, true);
+        setRequiredAction(MPxViewportComputeItem::kModifyConsolidation, true);
+#endif
+    }
+
+    virtual ~MeshViewportCompute()
+    {
+        if (0 != _uboResourceHandle)
+            glDeleteBuffers(1, &_uboResourceHandle);
+    }
+
+    bool execute(const MPxViewportComputeItem::Actions& availableActions, MRenderItem& renderItem)
+        override;
+    bool canConsolidate(const MPxViewportComputeItem& other) const override;
+    MSharedPtr<MPxViewportComputeItem> cloneForConsolidation() const override;
+    bool                               verifyDrawItem(const HdVP2DrawItem& drawItem) const;
+
+    void setTopologyDirty();
+    void setAdjacencyBufferGPUDirty();
+    void setNormalVertexBufferGPUDirty();
+};
+
+PXR_NAMESPACE_CLOSE_SCOPE
+
+#endif
+
+#endif

From 65b16345b23f05161d69af8b601d447d2d80cbd8 Mon Sep 17 00:00:00 2001
From: krickw <william.krick@autodesk.com>
Date: Fri, 4 Dec 2020 12:48:31 -0500
Subject: [PATCH 2/5] Build fixes for versions of Maya where the
 MAYA_API_VERSION is 2020 or less.

---
 lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp              | 1 +
 lib/mayaUsd/render/vp2RenderDelegate/mesh.h                | 2 ++
 lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h | 3 +--
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
index 32a4db7cec..ee38f51c8f 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
@@ -27,6 +27,7 @@
 #include <mayaUsd/utils/colorSpace.h>
 
 #include <pxr/base/gf/matrix4d.h>
+#include <pxr/base/tf/getenv.h>
 #include <pxr/imaging/hd/meshUtil.h>
 #include <pxr/imaging/hd/sceneDelegate.h>
 #include <pxr/imaging/hd/smoothNormals.h>
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
index 9249a64156..4003a35a20 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
@@ -18,6 +18,8 @@
 
 #include "meshViewportCompute.h"
 
+#include <mayaUsd/render/vp2RenderDelegate/proxyRenderDelegate.h>
+
 #include <pxr/imaging/hd/mesh.h>
 #include <pxr/pxr.h>
 
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
index 47119ca98b..8b8575b7d8 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
+++ b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
@@ -98,7 +98,6 @@
 #include <pxr/base/plug/plugin.h>
 #include <pxr/base/plug/thisPlugin.h>
 #include <pxr/base/tf/diagnostic.h>
-#include <pxr/base/tf/getenv.h>
 
 #ifdef HDVP2_ENABLE_GPU_OSD
 #include <pxr/imaging/pxOsd/refinerFactory.h>
@@ -171,7 +170,7 @@ class MeshViewportCompute : public MPxViewportComputeItem
     MVertexBuffer* _normalVertexBufferGPU { nullptr };   // not owned by *this, owned by fRenderGeom
     MVertexBuffer* _colorVertexBufferGPU { nullptr };    // not owned by *this, owned by fRenderGeom
 
-    bool _adjacencyTaskInProgress { false };z
+    bool _adjacencyTaskInProgress { false };
     bool _topologyDirty { true };           // sourceMeshSharedData->_renderingTopology has changed
     bool _adjacencyBufferGPUDirty { true }; //_adjacencyBufferGPU is dirty
     bool _normalVertexBufferGPUDirty { true }; //_normalVertexBufferGPU is dirty

From c970e7b811f1010216f7124800880d0baed8b3d4 Mon Sep 17 00:00:00 2001
From: krickw <william.krick@autodesk.com>
Date: Fri, 4 Dec 2020 14:14:05 -0500
Subject: [PATCH 3/5] Fix warnings which are errors on Linux and OSX.

---
 lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp  |  9 +++------
 lib/mayaUsd/render/vp2RenderDelegate/mesh.h    |  7 -------
 .../vp2RenderDelegate/meshViewportCompute.cpp  | 18 +++++++++---------
 3 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
index ee38f51c8f..35e9c590d2 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
@@ -906,8 +906,7 @@ void HdVP2Mesh::_UpdateDrawItem(
 #ifdef HDVP2_ENABLE_GPU_OSD
     const bool isLineItem = (renderItem->primitive() == MHWRender::MGeometry::kLines);
     // when we do OSD we don't bother creating indexing until after we have a smooth mesh
-    const bool requiresIndexUpdate
-        = !isBBoxItem && !isPointSnappingItem && (!_gpuOSDEnabled || isLineItem);
+    const bool requiresIndexUpdate = !isBBoxItem && !isPointSnappingItem && isLineItem;
 #else
     const bool requiresIndexUpdate = !isBBoxItem && !isPointSnappingItem;
 #endif
@@ -934,7 +933,7 @@ void HdVP2Mesh::_UpdateDrawItem(
                     numIndex * sizeof(int));
 
 #ifdef HDVP2_ENABLE_GPU_COMPUTE
-            if (requireSmoothNormals && (_gpuNormalsEnabled || _gpuOSDEnabled)) {
+            if (requireSmoothNormals && _gpuNormalsEnabled) {
                 // these function only do something if HDVP2_ENABLE_GPU_COMPUTE or
                 // HDVP2_ENABLE_GPU_OSD is defined
                 _CreateViewportCompute(*drawItem);
@@ -976,7 +975,7 @@ void HdVP2Mesh::_UpdateDrawItem(
             prepareNormals = ((itemDirtyBits & HdChangeTracker::DirtyNormals) != 0);
         } else if (requireSmoothNormals && (itemDirtyBits & DirtySmoothNormals)) {
 #ifdef HDVP2_ENABLE_GPU_COMPUTE
-            if (_gpuNormalsEnabled || _gpuOSDEnabled) {
+            if (_gpuNormalsEnabled) {
                 if (!_meshSharedData->_viewportCompute) {
                     _CreateViewportCompute(*drawItem);
 #ifdef HDVP2_ENABLE_GPU_OSD
@@ -1717,8 +1716,6 @@ void HdVP2Mesh::_CreateViewportCompute(const HdVP2DrawItem& drawItem)
 void HdVP2Mesh::_CreateOSDTables()
 {
 #if defined(DO_CPU_OSD) || defined(DO_OPENGL_OSD)
-    if (!_gpuOSDEnabled)
-        return;
 
     assert(_meshSharedData->_viewportCompute);
     MProfilingScope subProfilingScope(
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
index 4003a35a20..d7196bcbcf 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
@@ -185,13 +185,6 @@ class HdVP2Mesh final : public HdMesh
     //! makes the expressions using these variables much simpler
     bool _gpuNormalsEnabled { true }; //!< Use GPU Compute for normal calculation, only used when
                                       //!< HDVP2_ENABLE_GPU_COMPUTE is defined
-    bool _gpuOSDEnabled {
-#ifdef HDVP2_ENABLE_GPU_OSD
-        true
-#else
-        false
-#endif
-    }; //!< Use GPU Compute for OSD, only used when HDVP2_ENABLE_GPU_OSD is defined
     static int _gpuNormalsComputeThreshold;
 };
 
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp
index 5cf0b049c5..cce9ee5eaf 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp
+++ b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.cpp
@@ -264,7 +264,9 @@ void MeshViewportCompute::createConsolidatedTopology(TopologyAccessor getTopolog
         for (int sourceIndex = 0; sourceIndex < _geometryIndexMapping->geometryCount();
              sourceIndex++) {
             MRenderItem* sourceItem = _geometryIndexMapping->sourceRenderItem(sourceIndex);
-            int consolidatedBufferVertexOffset = _geometryIndexMapping->vertexStart(sourceIndex);
+            int          vertexStart = _geometryIndexMapping->vertexStart(sourceIndex);
+            TF_VERIFY(vertexStart >= 0);
+            size_t                          consolidatedBufferVertexOffset = (size_t)vertexStart;
             MSharedPtr<MeshViewportCompute> sourceViewportComputeItem
                 = MSharedPtr<MeshViewportCompute>::dynamic_pointer_cast<>(
                     sourceItem->viewportComputeItem());
@@ -279,24 +281,24 @@ void MeshViewportCompute::createConsolidatedTopology(TopologyAccessor getTopolog
             TF_VERIFY(sourceTopology.GetRefineLevel() == refineLevel);
 
             VtIntArray const& sourceFaceVertexCounts = sourceTopology.GetFaceVertexCounts();
-            for (int faceId = 0; faceId < sourceFaceVertexCounts.size(); faceId++) {
+            for (size_t faceId = 0; faceId < sourceFaceVertexCounts.size(); faceId++) {
                 faceVertexCounts.push_back(sourceFaceVertexCounts[faceId]);
             }
 
             VtIntArray const& sourceFaceVertexIndices = sourceTopology.GetFaceVertexIndices();
-            for (int faceVertexId = 0; faceVertexId < sourceFaceVertexIndices.size();
+            for (size_t faceVertexId = 0; faceVertexId < sourceFaceVertexIndices.size();
                  faceVertexId++) {
                 faceVertexIndices.push_back(
                     sourceFaceVertexIndices[faceVertexId] + consolidatedBufferVertexOffset);
             }
 
             VtIntArray const& sourceHoleIndices = sourceTopology.GetHoleIndices();
-            for (int faceId = 0; faceId < sourceHoleIndices.size(); faceId++) {
+            for (size_t faceId = 0; faceId < sourceHoleIndices.size(); faceId++) {
                 holeIndices.push_back(
                     sourceHoleIndices[faceId] + consolidatedBufferVertexOffset); // untested?
             }
 
-            for (int idx = 0; idx < sourceMeshSharedData->_renderingToSceneFaceVtxIds.size();
+            for (size_t idx = 0; idx < sourceMeshSharedData->_renderingToSceneFaceVtxIds.size();
                  idx++) {
                 _meshSharedData->_renderingToSceneFaceVtxIds.push_back(
                     sourceMeshSharedData->_renderingToSceneFaceVtxIds[idx]
@@ -310,7 +312,7 @@ void MeshViewportCompute::createConsolidatedTopology(TopologyAccessor getTopolog
                 _meshSharedData->_sceneToRenderingFaceVtxIds.push_back(-1);
             }
 
-            for (int idx = 0; idx < sourceMeshSharedData->_sceneToRenderingFaceVtxIds.size();
+            for (size_t idx = 0; idx < sourceMeshSharedData->_sceneToRenderingFaceVtxIds.size();
                  idx++) {
                 _meshSharedData->_sceneToRenderingFaceVtxIds.push_back(
                     sourceMeshSharedData->_sceneToRenderingFaceVtxIds[idx]
@@ -346,8 +348,6 @@ void MeshViewportCompute::createConsolidatedAdjacency()
         MProfiler::kColorD_L2,
         "MeshViewportCompute:createConsolidatedAdjacency");
 
-    bool isConsolidated = nullptr != _geometryIndexMapping.get();
-
     Hd_VertexAdjacencySharedPtr adjacency(new Hd_VertexAdjacency());
     HdBufferSourceSharedPtr     adjacencyComputation
         = adjacency->GetSharedAdjacencyBuilderComputation(&_meshSharedData->_topology);
@@ -708,7 +708,7 @@ void MeshViewportCompute::prepareAdjacencyBuffer()
     _adjacencyBufferGPU.reset(new MHWRender::MVertexBuffer(intArrayDesc));
     void* bufferData = _adjacencyBufferGPU->acquire(adjacencyBufferSize, true);
 
-    for (int i = 0; i < vertexDataSize; i += 2) {
+    for (size_t i = 0; i < vertexDataSize; i += 2) {
         vertexDataStart[i] += paddingSize;
     }
 

From 72df53843052fedb36bf6a505ee8ca4d47ef5ef0 Mon Sep 17 00:00:00 2001
From: krickw <william.krick@autodesk.com>
Date: Mon, 7 Dec 2020 10:12:42 -0500
Subject: [PATCH 4/5] Fix more unsigned-signed comparisons.

---
 lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp | 12 ++++++------
 lib/mayaUsd/render/vp2RenderDelegate/mesh.h   |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
index 35e9c590d2..3379042085 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.cpp
@@ -335,14 +335,14 @@ void HdVP2Mesh::_InitGPUCompute()
     // would also be nice to check the openGL version but renderer->drawAPIVersion() returns 4.
     // Compute was added in 4.3 so I don't have enough information to make the check
     if (renderer && renderer->drawAPIIsOpenGL()
-        && (TfGetenvInt("HDVP2_USE_GPU_NORMAL_COMPUTATION", 0) > 0))
-        _gpuNormalsComputeThreshold
-            = TfGetenvInt("HDVP2_GPU_NORMAL_COMPUTATION_MINIMUM_THRESHOLD", 8000);
-    else
-        _gpuNormalsComputeThreshold = INT_MAX;
+        && (TfGetenvInt("HDVP2_USE_GPU_NORMAL_COMPUTATION", 0) > 0)) {
+        int threshold = TfGetenvInt("HDVP2_GPU_NORMAL_COMPUTATION_MINIMUM_THRESHOLD", 8000);
+        _gpuNormalsComputeThreshold = threshold >= 0 ? (size_t)threshold : SIZE_MAX;
+    } else
+        _gpuNormalsComputeThreshold = SIZE_MAX;
 }
 
-int HdVP2Mesh::_gpuNormalsComputeThreshold = -1;
+size_t HdVP2Mesh::_gpuNormalsComputeThreshold = SIZE_MAX;
 //! \brief  Constructor
 #if defined(HD_API_VERSION) && HD_API_VERSION >= 36
 HdVP2Mesh::HdVP2Mesh(HdVP2RenderDelegate* delegate, const SdfPath& id)
diff --git a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
index d7196bcbcf..70bde70e68 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
+++ b/lib/mayaUsd/render/vp2RenderDelegate/mesh.h
@@ -185,7 +185,7 @@ class HdVP2Mesh final : public HdMesh
     //! makes the expressions using these variables much simpler
     bool _gpuNormalsEnabled { true }; //!< Use GPU Compute for normal calculation, only used when
                                       //!< HDVP2_ENABLE_GPU_COMPUTE is defined
-    static int _gpuNormalsComputeThreshold;
+    static size_t _gpuNormalsComputeThreshold;
 };
 
 PXR_NAMESPACE_CLOSE_SCOPE

From ad202091942e54b71eacf3dd40fe65716164cdb0 Mon Sep 17 00:00:00 2001
From: krickw <william.krick@autodesk.com>
Date: Thu, 17 Dec 2020 11:33:15 -0500
Subject: [PATCH 5/5] Add USD version check.

---
 lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
index 8b8575b7d8..5b1d0cfbc0 100644
--- a/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
+++ b/lib/mayaUsd/render/vp2RenderDelegate/meshViewportCompute.h
@@ -40,8 +40,10 @@
      * some objects draw with incorrect indexing
 */
 
-#if MAYA_API_VERSION >= 20210000 \
-    && !defined(OSMac_) // OSX doesn't have OpenGL 4.3 support necessary for compute
+// Maya 2020 is missing API necessary for compute support
+// OSX doesn't have OpenGL 4.3 support necessary for compute
+// USD before 20.08 doesn't include some OSD commits we rely on
+#if MAYA_API_VERSION >= 20210000 && !defined(OSMac_) && USD_VERSION_NUM > 2002
 #define HDVP2_ENABLE_GPU_COMPUTE
 #endif