From a3c3bf7edf96d8fe3473f9db5a7272e84467ca98 Mon Sep 17 00:00:00 2001 From: clach Date: Fri, 12 Nov 2021 11:24:42 -0800 Subject: [PATCH] Adds HgiShaderFunctionComputeDesc, with member "localSize", to HgiShaderFunctionDesc, allowing client to specify the desired number of threads in the local compute work group for a compute shader. HgiShaderGenerator now handles the writing of this local size to the shader, if necessary. The HgiComputeCmds::Dispatch() function for HgiGL and HgiVulkan will now use the local size provided by the HgiShaderFunctionDesc to determine number of work groups to dispatch. Calls to Dispatch() should specify the total size of the compute work (and not the total work size / local work size), and the backend will handle determining the number of work groups. Also adds some limit checks on work group size and number of work groups based on device limits. See #1656 (Internal change: 2199330) --- pxr/imaging/hdSt/codeGen.cpp | 3 -- pxr/imaging/hdSt/domeLightComputations.cpp | 15 +++++-- pxr/imaging/hdSt/flatNormals.cpp | 9 +++- pxr/imaging/hdSt/glslProgram.cpp | 27 ++++++++---- pxr/imaging/hdSt/glslProgram.h | 8 ++++ pxr/imaging/hdSt/quadrangulate.cpp | 5 +++ pxr/imaging/hdSt/shaders/compute.glslfx | 10 +++++ pxr/imaging/hdSt/shaders/domeLight.glslfx | 4 -- pxr/imaging/hdSt/smoothNormals.cpp | 11 +++-- pxr/imaging/hdSt/subdivision.cpp | 1 + pxr/imaging/hgi/shaderFunctionDesc.cpp | 21 ++++++++++ pxr/imaging/hgi/shaderFunctionDesc.h | 37 ++++++++++++++++ pxr/imaging/hgiGL/computeCmds.cpp | 40 +++++++++++++++++- pxr/imaging/hgiGL/computeCmds.h | 1 + pxr/imaging/hgiGL/shaderGenerator.cpp | 49 ++++++++++++++++++++-- pxr/imaging/hgiMetal/shaderGenerator.h | 3 ++ pxr/imaging/hgiMetal/shaderGenerator.mm | 15 +++++++ pxr/imaging/hgiVulkan/computeCmds.cpp | 45 +++++++++++++++++++- pxr/imaging/hgiVulkan/computeCmds.h | 1 + pxr/imaging/hgiVulkan/shaderGenerator.cpp | 25 ++++++++++- pxr/imaging/hgiVulkan/shaderGenerator.h | 1 + 21 files changed, 300 
insertions(+), 31 deletions(-) diff --git a/pxr/imaging/hdSt/codeGen.cpp b/pxr/imaging/hdSt/codeGen.cpp index 4a484c2ac8..1660d97f6a 100644 --- a/pxr/imaging/hdSt/codeGen.cpp +++ b/pxr/imaging/hdSt/codeGen.cpp @@ -1000,9 +1000,6 @@ HdSt_CodeGen::CompileComputeProgram(HdStResourceRegistry*const registry) _genCommon << "#extension GL_ARB_bindless_texture : require\n"; } - // default workgroup size (must follow #extension directives) - _genCommon << "layout(local_size_x = 1, local_size_y = 1) in;\n"; - // Used in glslfx files to determine if it is using new/old // imaging system. It can also be used as API guards when // we need new versions of Storm shading. diff --git a/pxr/imaging/hdSt/domeLightComputations.cpp b/pxr/imaging/hdSt/domeLightComputations.cpp index 4e1824da2c..773f052700 100644 --- a/pxr/imaging/hdSt/domeLightComputations.cpp +++ b/pxr/imaging/hdSt/domeLightComputations.cpp @@ -135,11 +135,21 @@ HdSt_DomeLightComputationGPU::Execute( HdStResourceRegistry* hdStResourceRegistry = static_cast(resourceRegistry); + + constexpr int localSize = 8; + HdStGLSLProgramSharedPtr const computeProgram = HdStGLSLProgram::GetComputeProgram( HdStPackageDomeLightShader(), _shaderToken, - static_cast(resourceRegistry)); + "", + static_cast(resourceRegistry), + [&] (HgiShaderFunctionDesc &computeDesc) { + computeDesc.debugName = _shaderToken.GetString(); + computeDesc.shaderStage = HgiShaderStageCompute; + computeDesc.computeDescriptor.localSize = + GfVec3i(localSize, localSize, 1); + }); if (!TF_VERIFY(computeProgram)) { return; } @@ -165,7 +175,6 @@ HdSt_DomeLightComputationGPU::Execute( int height = downsize ? 
srcDim[1] / 2 : srcDim[1]; // Make sure dimensions align with the local size used in the Compute Shader - constexpr int localSize = 8; width = _MakeMultipleOf(width, localSize); height = _MakeMultipleOf(height, localSize); @@ -264,7 +273,7 @@ HdSt_DomeLightComputationGPU::Execute( } // Queue compute work - computeCmds->Dispatch(width / localSize, height / localSize); + computeCmds->Dispatch(width, height); computeCmds->PopDebugGroup(); diff --git a/pxr/imaging/hdSt/flatNormals.cpp b/pxr/imaging/hdSt/flatNormals.cpp index 74814ec64f..8732c9eba0 100644 --- a/pxr/imaging/hdSt/flatNormals.cpp +++ b/pxr/imaging/hdSt/flatNormals.cpp @@ -241,6 +241,7 @@ HdSt_FlatNormalsComputationGPU::Execute( int indexStride; int pParamOffset; int pParamStride; + int primIndexEnd; } uniform; HdStResourceRegistry* hdStResourceRegistry = @@ -250,6 +251,7 @@ HdSt_FlatNormalsComputationGPU::Execute( [&](HgiShaderFunctionDesc &computeDesc) { computeDesc.debugName = shaderToken.GetString(); computeDesc.shaderStage = HgiShaderStageCompute; + computeDesc.computeDescriptor.localSize = GfVec3i(64, 1, 1); TfToken srcType; TfToken dstType; @@ -290,7 +292,8 @@ HdSt_FlatNormalsComputationGPU::Execute( "indexOffset", // interleave offset "indexStride", // interleave stride "pParamOffset", // interleave offset - "pParamStride" // interleave stride + "pParamStride", // interleave stride + "primIndexEnd" }; static_assert((sizeof(Uniform) / sizeof(int)) == (sizeof(params) / sizeof(params[0])), ""); @@ -340,6 +343,9 @@ HdSt_FlatNormalsComputationGPU::Execute( HdDataSizeOfType(HdGetComponentType(primitiveParam->GetTupleType().type)); uniform.pParamOffset = primitiveParam->GetOffset() / pParamComponentSize; uniform.pParamStride = primitiveParam->GetStride() / pParamComponentSize; + + const int numPrims = topologyRange->GetNumElements(); + uniform.primIndexEnd = numPrims; Hgi* hgi = hdStResourceRegistry->GetHgi(); @@ -395,7 +401,6 @@ HdSt_FlatNormalsComputationGPU::Execute( pipeline, BufferBinding_Uniforms, 
sizeof(uniform), &uniform); // Queue compute work - int numPrims = topologyRange->GetNumElements(); computeCmds->Dispatch(numPrims, 1); computeCmds->PopDebugGroup(); diff --git a/pxr/imaging/hdSt/glslProgram.cpp b/pxr/imaging/hdSt/glslProgram.cpp index 62f296cc6f..4e96d1e974 100644 --- a/pxr/imaging/hdSt/glslProgram.cpp +++ b/pxr/imaging/hdSt/glslProgram.cpp @@ -281,6 +281,8 @@ HdStGLSLProgram::CompileShader( HgiShaderFunctionDesc shaderFnDesc; shaderFnDesc.shaderCode = shaderSource.c_str(); shaderFnDesc.shaderStage = stage; + // Default work group size + shaderFnDesc.computeDescriptor.localSize = GfVec3i(1, 1, 1); HgiShaderFunctionHandle shaderFn = hgi->CreateShaderFunction(shaderFnDesc); if (!_ValidateCompilation(shaderFn, shaderType, shaderSource, _debugID)) { @@ -442,6 +444,21 @@ HdStGLSLProgram::GetComputeProgram( std::string const &defines, HdStResourceRegistry *resourceRegistry, PopulateDescriptorCallback populateDescriptor) +{ + return GetComputeProgram(HdStPackageComputeShader(), + shaderToken, + defines, + resourceRegistry, + populateDescriptor); +} + +HdStGLSLProgramSharedPtr +HdStGLSLProgram::GetComputeProgram( + TfToken const &shaderFileName, + TfToken const &shaderToken, + std::string const &defines, + HdStResourceRegistry *resourceRegistry, + PopulateDescriptorCallback populateDescriptor) { // Find the program from registry HdInstance programInstance = @@ -449,8 +466,7 @@ HdStGLSLProgram::GetComputeProgram( _ComputeHash(shaderToken, defines)); if (programInstance.IsFirstInstance()) { - // if not exists, create new one - TfToken const &shaderFileName = HdStPackageComputeShader(); + // If program does not exist, create new one const HioGlslfx glslfx(shaderFileName, HioGlslfxTokens->defVal); std::string errorString; if (!glslfx.IsValid(&errorString)){ @@ -462,14 +478,9 @@ HdStGLSLProgram::GetComputeProgram( Hgi *hgi = resourceRegistry->GetHgi(); HgiShaderFunctionDesc computeDesc; - std::string sourceCode( - "layout(local_size_x=1, local_size_y=1, 
local_size_z=1) in;\n"); - - sourceCode += defines; - populateDescriptor(computeDesc); - sourceCode += glslfx.GetSource(shaderToken); + const std::string sourceCode = defines + glslfx.GetSource(shaderToken); computeDesc.shaderCode = sourceCode.c_str(); HgiShaderFunctionHandle computeFn = diff --git a/pxr/imaging/hdSt/glslProgram.h b/pxr/imaging/hdSt/glslProgram.h index f36bbe16ce..7a1ffa8327 100644 --- a/pxr/imaging/hdSt/glslProgram.h +++ b/pxr/imaging/hdSt/glslProgram.h @@ -95,6 +95,14 @@ class HdStGLSLProgram final HdStResourceRegistry *resourceRegistry, PopulateDescriptorCallback populateDescriptor); + HDST_API + static HdStGLSLProgramSharedPtr GetComputeProgram( + TfToken const &shaderFileName, + TfToken const &shaderToken, + std::string const &defines, + HdStResourceRegistry *resourceRegistry, + PopulateDescriptorCallback populateDescriptor); + /// Returns the role of the GPU data in this resource. TfToken const & GetRole() const {return _role;} diff --git a/pxr/imaging/hdSt/quadrangulate.cpp b/pxr/imaging/hdSt/quadrangulate.cpp index 727410df31..6e15f6321a 100644 --- a/pxr/imaging/hdSt/quadrangulate.cpp +++ b/pxr/imaging/hdSt/quadrangulate.cpp @@ -512,6 +512,7 @@ HdSt_QuadrangulateComputationGPU::Execute( int primvarOffset; int primvarStride; int numComponents; + int indexEnd; } uniform; // select shader by datatype @@ -527,6 +528,8 @@ HdSt_QuadrangulateComputationGPU::Execute( [&](HgiShaderFunctionDesc &computeDesc) { computeDesc.debugName = shaderToken.GetString(); computeDesc.shaderStage = HgiShaderStageCompute; + computeDesc.computeDescriptor.localSize = GfVec3i(64, 1, 1); + if (shaderToken == HdStGLSLProgramTokens->quadrangulateFloat) { HgiShaderFunctionAddWritableBuffer( &computeDesc, "primvar", HdStTokens->_float, @@ -548,6 +551,7 @@ HdSt_QuadrangulateComputationGPU::Execute( "primvarOffset", // interleave offset "primvarStride", // interleave stride "numComponents", // interleave datasize + "indexEnd" }; static_assert((sizeof(Uniform) / sizeof(int)) 
== (sizeof(params) / sizeof(params[0])), ""); @@ -599,6 +603,7 @@ HdSt_QuadrangulateComputationGPU::Execute( HdGetComponentCount(primvar->GetTupleType().type); int numNonQuads = (int)quadInfo->numVerts.size(); + uniform.indexEnd = numNonQuads; Hgi* hgi = hdStResourceRegistry->GetHgi(); diff --git a/pxr/imaging/hdSt/shaders/compute.glslfx b/pxr/imaging/hdSt/shaders/compute.glslfx index c8631d7805..5e26bfe762 100644 --- a/pxr/imaging/hdSt/shaders/compute.glslfx +++ b/pxr/imaging/hdSt/shaders/compute.glslfx @@ -201,6 +201,9 @@ int getNormalsIndex(int idx) void main() { int index = int(hd_GlobalInvocationID.x); + if (index >= indexEnd) { + return; + } int offIndex = index * 2 + adjacencyOffset; @@ -264,6 +267,10 @@ vec3 computeNormalForPrimIndex(int primIndex); void main() { int primIndex = int(hd_GlobalInvocationID.x); + if (primIndex >= primIndexEnd) { + return; + } + int pParam = primitiveParam[getPrimitiveParamIndex(primIndex)]; int edgeFlag = getEdgeFlag(pParam); int faceIndex = getFaceIndex(pParam); @@ -377,6 +384,9 @@ vec3 computeNormalForPrimIndex(int primIndex) void main() { int index = int(hd_GlobalInvocationID.x); + if (index >= indexEnd) { + return; + } int quadInfoIndex = index * quadInfoStride + quadInfoOffset; int numVert = quadInfo[quadInfoIndex]; diff --git a/pxr/imaging/hdSt/shaders/domeLight.glslfx b/pxr/imaging/hdSt/shaders/domeLight.glslfx index 15080aae68..b655a854c9 100644 --- a/pxr/imaging/hdSt/shaders/domeLight.glslfx +++ b/pxr/imaging/hdSt/shaders/domeLight.glslfx @@ -117,8 +117,6 @@ vec3 ImportanceSample_GGX(vec2 Xi, float roughness, vec3 normal) --- -------------------------------------------------------------------------- -- glsl DomeLight.Irradiance -layout(local_size_x = 8, local_size_y = 8) in; - const float deltaPhi = (2.0f * float(PI)) / 180.0f; const float deltaTheta = (0.5f * float(PI)) / 64.0f; @@ -169,7 +167,6 @@ void main(void) --- -------------------------------------------------------------------------- -- glsl 
DomeLight.Prefilter -layout(local_size_x = 8, local_size_y = 8) in; layout(std140, binding=0) uniform Uniforms { float roughness; } uniforms; @@ -234,7 +231,6 @@ void main(void) --- -------------------------------------------------------------------------- -- glsl DomeLight.BRDF -layout(local_size_x = 8, local_size_y = 8) in; uniform int sampleLevel = 0; float Geometry_SchlicksmithGGX(float dotNL, float dotNV, float roughness) diff --git a/pxr/imaging/hdSt/smoothNormals.cpp b/pxr/imaging/hdSt/smoothNormals.cpp index 6fd393b2c8..4d2e7e1150 100644 --- a/pxr/imaging/hdSt/smoothNormals.cpp +++ b/pxr/imaging/hdSt/smoothNormals.cpp @@ -178,6 +178,7 @@ HdSt_SmoothNormalsComputationGPU::Execute( int pointsStride; int normalsOffset; int normalsStride; + int indexEnd; } uniform; HdStResourceRegistry* hdStResourceRegistry = @@ -187,6 +188,7 @@ HdSt_SmoothNormalsComputationGPU::Execute( [&](HgiShaderFunctionDesc &computeDesc) { computeDesc.debugName = shaderToken.GetString(); computeDesc.shaderStage = HgiShaderStageCompute; + computeDesc.computeDescriptor.localSize = GfVec3i(64, 1, 1); TfToken srcType; TfToken dstType; @@ -220,6 +222,7 @@ HdSt_SmoothNormalsComputationGPU::Execute( "pointsStride", // interleave stride "normalsOffset", // interleave offset "normalsStride", // interleave stride + "indexEnd" }; static_assert((sizeof(Uniform) / sizeof(int)) == (sizeof(params) / sizeof(params[0])), ""); @@ -266,7 +269,6 @@ HdSt_SmoothNormalsComputationGPU::Execute( HdDataSizeOfType(HdGetComponentType(normals->GetTupleType().type)); uniform.normalsOffset = normals->GetOffset() / normalComponentSize; uniform.normalsStride = normals->GetStride() / normalComponentSize; - // The number of points is based off the size of the output, // However, the number of points in the adjacency table // is computed based off the largest vertex indexed from @@ -274,10 +276,11 @@ HdSt_SmoothNormalsComputationGPU::Execute( // // Therefore, we need to clamp the number of points // to the number of entries 
in the adjancency table. - int numDestPoints = range->GetNumElements(); - int numSrcPoints = _adjacency->GetNumPoints(); + const int numDestPoints = range->GetNumElements(); + const int numSrcPoints = _adjacency->GetNumPoints(); - int numPoints = std::min(numSrcPoints, numDestPoints); + const int numPoints = std::min(numSrcPoints, numDestPoints); + uniform.indexEnd = numPoints; Hgi* hgi = hdStResourceRegistry->GetHgi(); diff --git a/pxr/imaging/hdSt/subdivision.cpp b/pxr/imaging/hdSt/subdivision.cpp index 0db3aaf846..db7fa5b317 100644 --- a/pxr/imaging/hdSt/subdivision.cpp +++ b/pxr/imaging/hdSt/subdivision.cpp @@ -1840,6 +1840,7 @@ _EvalStencilsGPU( [&](HgiShaderFunctionDesc &computeDesc) { computeDesc.debugName = shaderToken.GetString(); computeDesc.shaderStage = HgiShaderStageCompute; + computeDesc.computeDescriptor.localSize = GfVec3i(64, 1, 1); HgiShaderFunctionAddBuffer(&computeDesc, "sizes", HdStTokens->_int, diff --git a/pxr/imaging/hgi/shaderFunctionDesc.cpp b/pxr/imaging/hgi/shaderFunctionDesc.cpp index a8cfb85217..df5c5e0203 100644 --- a/pxr/imaging/hgi/shaderFunctionDesc.cpp +++ b/pxr/imaging/hgi/shaderFunctionDesc.cpp @@ -41,6 +41,11 @@ HgiShaderFunctionBufferDesc::HgiShaderFunctionBufferDesc() HgiShaderFunctionParamDesc::HgiShaderFunctionParamDesc() = default; +HgiShaderFunctionComputeDesc::HgiShaderFunctionComputeDesc() + : localSize(GfVec3i(0, 0, 0)) +{ +} + HgiShaderFunctionTessellationDesc::HgiShaderFunctionTessellationDesc() = default; @@ -52,6 +57,7 @@ HgiShaderFunctionDesc::HgiShaderFunctionDesc() , stageInputs() , stageOutputs() , tessellationDescriptor() + , computeDescriptor() { } @@ -108,6 +114,20 @@ bool operator!=( return !(lhs == rhs); } +bool operator==( + const HgiShaderFunctionComputeDesc& lhs, + const HgiShaderFunctionComputeDesc& rhs) +{ + return lhs.localSize == rhs.localSize; +} + +bool operator!=( + const HgiShaderFunctionComputeDesc& lhs, + const HgiShaderFunctionComputeDesc& rhs) +{ + return !(lhs == rhs); +} + bool operator==( 
const HgiShaderFunctionTessellationDesc& lhs, const HgiShaderFunctionTessellationDesc& rhs) @@ -136,6 +156,7 @@ bool operator==( lhs.constantParams == rhs.constantParams && lhs.stageInputs == rhs.stageInputs && lhs.stageOutputs == rhs.stageOutputs && + lhs.computeDescriptor == rhs.computeDescriptor && lhs.tessellationDescriptor == rhs.tessellationDescriptor; } diff --git a/pxr/imaging/hgi/shaderFunctionDesc.h b/pxr/imaging/hgi/shaderFunctionDesc.h index cf5b3d220b..693601fab2 100644 --- a/pxr/imaging/hgi/shaderFunctionDesc.h +++ b/pxr/imaging/hgi/shaderFunctionDesc.h @@ -160,6 +160,40 @@ bool operator!=( const HgiShaderFunctionParamDesc& lhs, const HgiShaderFunctionParamDesc& rhs); +/// \struct HgiShaderFunctionComputeDesc +/// +/// Describes a compute function's description +/// +///
    +///
  • localSize: +/// Optional. Specifies the 3D size of the local thread grouping. Defaults to +/// 0, meaning it is not set. When x > 0, y and z must also be set > 0. +/// When localSize is set to > 0, the following source is generated: +/// GLSL: layout(local_size_x = localSize[0], +/// local_size_y = localSize[1], local_size_z = localSize[2]) in; +/// MSL: [[max_total_threads_per_threadgroup(localSize[0] * +/// localSize[1] * localSize[2])]] +///
  • +///
+/// +struct HgiShaderFunctionComputeDesc +{ + HGI_API + HgiShaderFunctionComputeDesc(); + + GfVec3i localSize; +}; + +HGI_API +bool operator==( + const HgiShaderFunctionComputeDesc& lhs, + const HgiShaderFunctionComputeDesc& rhs); + +HGI_API +bool operator!=( + const HgiShaderFunctionComputeDesc& lhs, + const HgiShaderFunctionComputeDesc& rhs); + /// \struct HgiShaderFunctionTessellationDesc /// /// Describes a tessellation function's description @@ -217,6 +251,8 @@ bool operator!=( /// List of descriptions of the outputs of the shader. ///
  • tessellationDesc: /// Description of tessellation shader function.
  • +///
  • computeDescriptor: +/// Description of compute shader function.
  • /// /// struct HgiShaderFunctionDesc @@ -232,6 +268,7 @@ struct HgiShaderFunctionDesc std::vector stageInputs; std::vector stageOutputs; HgiShaderFunctionTessellationDesc tessellationDescriptor; + HgiShaderFunctionComputeDesc computeDescriptor; }; using HgiShaderFunctionDescVector = diff --git a/pxr/imaging/hgiGL/computeCmds.cpp b/pxr/imaging/hgiGL/computeCmds.cpp index aa66696dbf..a4a5dde64e 100644 --- a/pxr/imaging/hgiGL/computeCmds.cpp +++ b/pxr/imaging/hgiGL/computeCmds.cpp @@ -37,6 +37,7 @@ PXR_NAMESPACE_OPEN_SCOPE HgiGLComputeCmds::HgiGLComputeCmds(HgiGLDevice* device) : HgiComputeCmds() , _pushStack(0) + , _localWorkGroupSize(GfVec3i(1, 1, 1)) { } @@ -46,6 +47,22 @@ void HgiGLComputeCmds::BindPipeline(HgiComputePipelineHandle pipeline) { _ops.push_back( HgiGLOps::BindPipeline(pipeline) ); + + // Get and store local work group size from shader function desc + const HgiShaderFunctionHandleVector shaderFunctionsHandles = + pipeline.Get()->GetDescriptor().shaderProgram.Get()->GetDescriptor(). 
+ shaderFunctions; + + for (const auto &handle : shaderFunctionsHandles) { + const HgiShaderFunctionDesc &shaderDesc = handle.Get()->GetDescriptor(); + if (shaderDesc.shaderStage == HgiShaderStageCompute) { + if (shaderDesc.computeDescriptor.localSize[0] > 0 && + shaderDesc.computeDescriptor.localSize[1] > 0 && + shaderDesc.computeDescriptor.localSize[2] > 0) { + _localWorkGroupSize = shaderDesc.computeDescriptor.localSize; + } + } + } } void @@ -73,8 +90,29 @@ HgiGLComputeCmds::SetConstantValues( void HgiGLComputeCmds::Dispatch(int dimX, int dimY) { + const int threadsPerGroupX = _localWorkGroupSize[0]; + const int threadsPerGroupY = _localWorkGroupSize[1]; + int numWorkGroupsX = (dimX + (threadsPerGroupX - 1)) / threadsPerGroupX; + int numWorkGroupsY = (dimY + (threadsPerGroupY - 1)) / threadsPerGroupY; + + // Determine device's num compute work group limits + int maxNumWorkGroups[2] = { 0, 0 }; + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &maxNumWorkGroups[0]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &maxNumWorkGroups[1]); + + if (numWorkGroupsX > maxNumWorkGroups[0]) { + TF_WARN("Max number of work group available from device is %i, larger " + "than %i", maxNumWorkGroups[0], numWorkGroupsX); + numWorkGroupsX = maxNumWorkGroups[0]; + } + if (numWorkGroupsY > maxNumWorkGroups[1]) { + TF_WARN("Max number of work group available from device is %i, larger " + "than %i", maxNumWorkGroups[1], numWorkGroupsY); + numWorkGroupsY = maxNumWorkGroups[1]; + } + _ops.push_back( - HgiGLOps::Dispatch(dimX, dimY) + HgiGLOps::Dispatch(numWorkGroupsX, numWorkGroupsY) ); } diff --git a/pxr/imaging/hgiGL/computeCmds.h b/pxr/imaging/hgiGL/computeCmds.h index a6d609dfe5..e2554fb005 100644 --- a/pxr/imaging/hgiGL/computeCmds.h +++ b/pxr/imaging/hgiGL/computeCmds.h @@ -84,6 +84,7 @@ class HgiGLComputeCmds final : public HgiComputeCmds HgiGLOpsVector _ops; int _pushStack; + GfVec3i _localWorkGroupSize; // Cmds is used only one frame so storing multi-frame state on 
will not // survive. diff --git a/pxr/imaging/hgiGL/shaderGenerator.cpp b/pxr/imaging/hgiGL/shaderGenerator.cpp index 4c5a516073..9b5b9f0c8a 100644 --- a/pxr/imaging/hgiGL/shaderGenerator.cpp +++ b/pxr/imaging/hgiGL/shaderGenerator.cpp @@ -21,6 +21,7 @@ // KIND, either express or implied. See the Apache License for the specific // language governing permissions and limitations under the Apache License. // +#include "pxr/imaging/garch/glApi.h" #include "pxr/imaging/hgiGL/shaderGenerator.h" #include "pxr/imaging/hgi/tokens.h" @@ -47,11 +48,53 @@ HgiGLShaderGenerator::HgiGLShaderGenerator( : HgiShaderGenerator(descriptor) , _version(version) { - //Write out all GL shaders and add to shader sections + // Write out all GL shaders and add to shader sections GetShaderSections()->push_back( std::make_unique( _GetMacroBlob(), "")); + if (descriptor.shaderStage == HgiShaderStageCompute) { + + int workSizeX = descriptor.computeDescriptor.localSize[0]; + int workSizeY = descriptor.computeDescriptor.localSize[1]; + int workSizeZ = descriptor.computeDescriptor.localSize[2]; + + if (workSizeX == 0 || workSizeY == 0 || workSizeZ == 0) { + workSizeX = 1; + workSizeY = 1; + workSizeZ = 1; + } + + // Determine device's compute work group local size limits + int maxLocalSize[3] = { 0, 0, 0 }; + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &maxLocalSize[0]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &maxLocalSize[1]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &maxLocalSize[2]); + + if (workSizeX > maxLocalSize[0]) { + TF_WARN("Max size of compute work group available from device is " + "%i, larger than %i", maxLocalSize[0], workSizeX); + workSizeX = maxLocalSize[0]; + } + if (workSizeY > maxLocalSize[1]) { + TF_WARN("Max size of compute work group available from device is " + "%i, larger than %i", maxLocalSize[1], workSizeY); + workSizeY = maxLocalSize[1]; + } + if (workSizeZ > maxLocalSize[2]) { + TF_WARN("Max size of compute work group available from 
device is " + "%i, larger than %i", maxLocalSize[2], workSizeZ); + workSizeZ = maxLocalSize[2]; + } + + _shaderLayoutAttributes.push_back( + std::string("layout(") + + "local_size_x = " + std::to_string(workSizeX) + ", " + "local_size_y = " + std::to_string(workSizeY) + ", " + "local_size_z = " + std::to_string(workSizeZ) + ") in;\n" + ); + } + _WriteTextures(descriptor.textures); _WriteBuffers(descriptor.buffers); _WriteInOuts(descriptor.stageInputs, "in"); @@ -175,9 +218,9 @@ HgiGLShaderGenerator::_Execute( const std::string &originalShaderShader) { // Version number must be first line in glsl shader - ss << _version << " \n"; + ss << _version << "\n"; - for (const std::string attr : _shaderLayoutAttributes) { + for (const std::string &attr : _shaderLayoutAttributes) { ss << attr; } diff --git a/pxr/imaging/hgiMetal/shaderGenerator.h b/pxr/imaging/hgiMetal/shaderGenerator.h index 390455fefd..9d86f20af0 100644 --- a/pxr/imaging/hgiMetal/shaderGenerator.h +++ b/pxr/imaging/hgiMetal/shaderGenerator.h @@ -32,6 +32,8 @@ #include "pxr/imaging/hgiMetal/shaderSection.h" #include "pxr/imaging/hgi/shaderGenerator.h" +#include "pxr/base/gf/vec3i.h" + PXR_NAMESPACE_OPEN_SCOPE //Shader program structure @@ -90,6 +92,7 @@ class HgiMetalShaderGenerator final: public HgiShaderGenerator HgiMetalShaderSectionUniquePtrVector _shaderSections; HgiMetalShaderStageEntryPointUniquePtr _generatorShaderSections; + GfVec3i _computeThreadGroupSize; }; diff --git a/pxr/imaging/hgiMetal/shaderGenerator.mm b/pxr/imaging/hgiMetal/shaderGenerator.mm index 2580b06b67..b5d99f693c 100644 --- a/pxr/imaging/hgiMetal/shaderGenerator.mm +++ b/pxr/imaging/hgiMetal/shaderGenerator.mm @@ -920,10 +920,15 @@ void _Init( id device) : HgiShaderGenerator(descriptor) , _generatorShaderSections(_BuildShaderStageEntryPoints(descriptor)) + , _computeThreadGroupSize(GfVec3i(0)) { CreateShaderSection( _GetHeader(device), "Headers"); + + if (descriptor.shaderStage == HgiShaderStageCompute) { + _computeThreadGroupSize 
= descriptor.computeDescriptor.localSize; + } } HgiMetalShaderGenerator::~HgiMetalShaderGenerator() = default; @@ -978,6 +983,16 @@ void _Init( //handle compute returnSS << "void"; } + + if (_computeThreadGroupSize[0] > 0 && + _computeThreadGroupSize[1] > 0 && + _computeThreadGroupSize[2] > 0) { + ss << "[[max_total_threads_per_threadgroup(" + << _computeThreadGroupSize[0] << " * " + << _computeThreadGroupSize[1] << " * " + << _computeThreadGroupSize[2] << ")]]\n"; + } + ss << _generatorShaderSections->GetEntryPointStageName(); ss << " " << returnSS.str() << " " << _generatorShaderSections->GetEntryPointFunctionName() << "(\n"; diff --git a/pxr/imaging/hgiVulkan/computeCmds.cpp b/pxr/imaging/hgiVulkan/computeCmds.cpp index 675115b9d5..dc2039b02a 100755 --- a/pxr/imaging/hgiVulkan/computeCmds.cpp +++ b/pxr/imaging/hgiVulkan/computeCmds.cpp @@ -41,6 +41,7 @@ HgiVulkanComputeCmds::HgiVulkanComputeCmds(HgiVulkan* hgi) , _pushConstantsDirty(false) , _pushConstants(nullptr) , _pushConstantsByteSize(0) + , _localWorkGroupSize(GfVec3i(1, 1, 1)) { } @@ -75,6 +76,22 @@ HgiVulkanComputeCmds::BindPipeline(HgiComputePipelineHandle pipeline) _pipelineLayout = pso->GetVulkanPipelineLayout(); pso->BindPipeline(_commandBuffer->GetVulkanCommandBuffer()); } + + // Get and store local work group size from shader function desc + const HgiShaderFunctionHandleVector shaderFunctionsHandles = + pipeline.Get()->GetDescriptor().shaderProgram.Get()->GetDescriptor(). 
+ shaderFunctions; + + for (const auto &handle : shaderFunctionsHandles) { + const HgiShaderFunctionDesc &shaderDesc = handle.Get()->GetDescriptor(); + if (shaderDesc.shaderStage == HgiShaderStageCompute) { + if (shaderDesc.computeDescriptor.localSize[0] > 0 && + shaderDesc.computeDescriptor.localSize[1] > 0 && + shaderDesc.computeDescriptor.localSize[2] > 0) { + _localWorkGroupSize = shaderDesc.computeDescriptor.localSize; + } + } + } } void @@ -109,10 +126,34 @@ HgiVulkanComputeCmds::Dispatch(int dimX, int dimY) _CreateCommandBuffer(); _BindResources(); + const int threadsPerGroupX = _localWorkGroupSize[0]; + const int threadsPerGroupY = _localWorkGroupSize[1]; + int numWorkGroupsX = (dimX + (threadsPerGroupX - 1)) / threadsPerGroupX; + int numWorkGroupsY = (dimY + (threadsPerGroupY - 1)) / threadsPerGroupY; + + // Determine device's num compute work group limits + const VkPhysicalDeviceLimits limits = + _hgi->GetCapabilities()->vkDeviceProperties.limits; + const GfVec3i maxNumWorkGroups = GfVec3i( + limits.maxComputeWorkGroupCount[0], + limits.maxComputeWorkGroupCount[1], + limits.maxComputeWorkGroupCount[2]); + + if (numWorkGroupsX > maxNumWorkGroups[0]) { + TF_WARN("Max number of work group available from device is %i, larger " + "than %i", maxNumWorkGroups[0], numWorkGroupsX); + numWorkGroupsX = maxNumWorkGroups[0]; + } + if (numWorkGroupsY > maxNumWorkGroups[1]) { + TF_WARN("Max number of work group available from device is %i, larger " + "than %i", maxNumWorkGroups[1], numWorkGroupsY); + numWorkGroupsY = maxNumWorkGroups[1]; + } + vkCmdDispatch( _commandBuffer->GetVulkanCommandBuffer(), - (uint32_t) dimX, - (uint32_t) dimY, + (uint32_t) numWorkGroupsX, + (uint32_t) numWorkGroupsY, 1); } diff --git a/pxr/imaging/hgiVulkan/computeCmds.h b/pxr/imaging/hgiVulkan/computeCmds.h index 3cf7cd2d57..ab7332f245 100644 --- a/pxr/imaging/hgiVulkan/computeCmds.h +++ b/pxr/imaging/hgiVulkan/computeCmds.h @@ -95,6 +95,7 @@ class HgiVulkanComputeCmds final : public 
HgiComputeCmds bool _pushConstantsDirty; uint8_t* _pushConstants; uint32_t _pushConstantsByteSize; + GfVec3i _localWorkGroupSize; // Cmds is used only one frame so storing multi-frame state on will not // survive. diff --git a/pxr/imaging/hgiVulkan/shaderGenerator.cpp b/pxr/imaging/hgiVulkan/shaderGenerator.cpp index 4ecf600b95..9a463523e1 100644 --- a/pxr/imaging/hgiVulkan/shaderGenerator.cpp +++ b/pxr/imaging/hgiVulkan/shaderGenerator.cpp @@ -48,10 +48,29 @@ HgiVulkanShaderGenerator::HgiVulkanShaderGenerator( , _bindIndex(0) , _version(version) { - //Write out all GL shaders and add to shader sections + // Write out all GL shaders and add to shader sections GetShaderSections()->push_back( std::make_unique(_GetMacroBlob(), "")); + if (descriptor.shaderStage == HgiShaderStageCompute) { + int workSizeX = descriptor.computeDescriptor.localSize[0]; + int workSizeY = descriptor.computeDescriptor.localSize[1]; + int workSizeZ = descriptor.computeDescriptor.localSize[2]; + + if (workSizeX == 0 || workSizeY == 0 || workSizeZ == 0) { + workSizeX = 1; + workSizeY = 1; + workSizeZ = 1; + } + + _shaderLayoutAttributes.push_back( + std::string("layout(") + + "local_size_x = " + std::to_string(workSizeX) + ", " + "local_size_y = " + std::to_string(workSizeY) + ", " + "local_size_z = " + std::to_string(workSizeZ) + ") in;\n" + ); + } + // The ordering here is important (buffers before textures), because we // need to increment the bind location for resources in the same order // as HgiVulkanResourceBindings. 
@@ -184,6 +203,10 @@ HgiVulkanShaderGenerator::_Execute( { // Version number must be first line in glsl shader ss << _version << " \n"; + + for (const std::string &attr : _shaderLayoutAttributes) { + ss << attr; + } HgiVulkanShaderSectionUniquePtrVector* shaderSections = GetShaderSections(); //For all shader sections, visit the areas defined for all diff --git a/pxr/imaging/hgiVulkan/shaderGenerator.h b/pxr/imaging/hgiVulkan/shaderGenerator.h index 794a581872..5428b49e23 100644 --- a/pxr/imaging/hgiVulkan/shaderGenerator.h +++ b/pxr/imaging/hgiVulkan/shaderGenerator.h @@ -74,6 +74,7 @@ class HgiVulkanShaderGenerator final: public HgiShaderGenerator HgiVulkanShaderSectionUniquePtrVector _shaderSections; uint32_t _bindIndex; + std::vector _shaderLayoutAttributes; std::string _version; };