From bec5c82137c696a2411d9b643804cfe3acc163d9 Mon Sep 17 00:00:00 2001
From: Sylvain Doremus <dragonjoker59@hotmail.com>
Date: Mon, 29 May 2023 00:08:55 +0200
Subject: [PATCH] Clusters: Added support for sorting the lights' Morton codes.

---
 .github/workflows/cmake.yml                   |   5 +-
 data/vcpkg/ports/rendergraph/portfile.cmake   |   8 +-
 data/vcpkg/ports/shaderwriter/portfile.cmake  |   8 +-
 external/RenderGraph                          |   2 +-
 external/ShaderWriter                         |   2 +-
 external/vcpkg                                |   2 +-
 .../Core/Castor3D/Buffer/GpuBufferOffset.hpp  |  30 +
 .../Render/Clustered/BuildLightsBVH.hpp       |   2 +-
 .../Render/Clustered/ClusteredModule.hpp      |   2 +
 .../Render/Clustered/FrustumClusters.hpp      |  44 +-
 .../Render/Clustered/SortLightsMortonCode.hpp |  18 +
 .../Core/CastorUtils/Design/DataHolder.hpp    |   2 +-
 .../Core/CastorUtils/Exception/Assertion.hpp  |   1 -
 source/Core/Castor3D/Buffer/GpuBufferPool.cpp | 107 +-
 source/Core/Castor3D/CMakeLists.txt           |   2 +
 source/Core/Castor3D/DebugDefines.hpp         |   1 +
 .../Clustered/AssignLightsToClusters.cpp      |  41 +-
 .../Render/Clustered/BuildLightsBVH.cpp       |  90 +-
 .../Render/Clustered/ComputeClustersAABB.cpp  |   4 +-
 .../Clustered/ComputeLightsMortonCode.cpp     |  36 +-
 .../Render/Clustered/FrustumClusters.cpp      |  55 +-
 .../Render/Clustered/ReduceLightsAABB.cpp     |  10 +-
 .../Render/Clustered/SortLightsMortonCode.cpp | 970 ++++++++++++++++++
 .../VoxelConeTracing/Voxelizer.cpp            |   2 +-
 .../Lighting/ClusteredLightsPipeline.cpp      |   2 +-
 .../Render/Opaque/Lighting/LightsPipeline.cpp |   2 +-
 .../Render/Opaque/VisibilityReorderPass.cpp   |   2 +-
 source/Core/Castor3D/Render/RenderTarget.cpp  |   1 +
 .../Core/Castor3D/Scene/Light/SpotLight.cpp   |   2 +-
 .../CastorUtils/CastorUtilsPrerequisites.cpp  |   1 -
 tools/CastorTestLauncher/MainFrame.cpp        |   2 +-
 tools/CastorViewer/MainFrame.cpp              |  14 +-
 vcpkg.json                                    |   3 +-
 33 files changed, 1386 insertions(+), 87 deletions(-)
 create mode 100644 include/Core/Castor3D/Render/Clustered/SortLightsMortonCode.hpp
 create mode 100644 source/Core/Castor3D/Render/Clustered/SortLightsMortonCode.cpp

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 1c8fc0b3fd..6f82b4e01d 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -25,6 +25,9 @@ jobs:
     - name: Checkout submodules
       run: |
         git submodule update --init -- "CMake"
+        git submodule update --init --recursive -- "external/Ashes"
+        git submodule update --init --recursive -- "external/ShaderWriter"
+        git submodule update --init --recursive -- "external/RenderGraph"
         git submodule update --init -- "external/vcpkg"
     - name: Setup vcpkg
       uses: lukka/run-vcpkg@v10
@@ -51,7 +54,7 @@ jobs:
       shell: bash
       working-directory: ${{runner.workspace}}/build-${{ matrix.buildType }}
       run: |
-        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.buildType }} -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/external/vcpkg/scripts/buildsystems/vcpkg.cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/package/Castor3D -DPROJECTS_WARNINGS_AS_ERRORS=$PROJ_WAE -DPROJECTS_UNITY_BUILD=ON -DPROJECTS_USE_PRECOMPILED_HEADERS=OFF -DSDW_GENERATE_SOURCE=OFF -DCASTOR_BUILD_PLUGINS=ON -DCASTOR_BUILDGRP_INTEROP=OFF -DCASTOR_BUILDGRP_TEST=OFF -DCASTOR_USE_GLSLANG=ON -DCASTOR_DISABLE_DELAYED_INITIALISATION=ON -DASHES_BUILD_RENDERER_OGL=OFF -DCASTOR_BUILDGRP_SAMPLE=OFF -DCASTOR_BUILDGRP_TOOL=OFF -DCASTOR_FORCE_VCPKG_SUBMODULES=ON
+        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.buildType }} -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/external/vcpkg/scripts/buildsystems/vcpkg.cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/package/Castor3D -DPROJECTS_WARNINGS_AS_ERRORS=$PROJ_WAE -DPROJECTS_UNITY_BUILD=ON -DPROJECTS_USE_PRECOMPILED_HEADERS=OFF -DSDW_GENERATE_SOURCE=OFF -DCASTOR_BUILD_PLUGINS=ON -DCASTOR_BUILDGRP_INTEROP=OFF -DCASTOR_BUILDGRP_TEST=OFF -DCASTOR_USE_GLSLANG=ON -DCASTOR_DISABLE_DELAYED_INITIALISATION=ON -DASHES_BUILD_RENDERER_OGL=OFF -DCASTOR_BUILDGRP_SAMPLE=OFF -DCASTOR_BUILDGRP_TOOL=OFF
     - name: Build
       if: steps.configure.conclusion == 'success'
       id: build
diff --git a/data/vcpkg/ports/rendergraph/portfile.cmake b/data/vcpkg/ports/rendergraph/portfile.cmake
index e8818301af..3f722d50a8 100644
--- a/data/vcpkg/ports/rendergraph/portfile.cmake
+++ b/data/vcpkg/ports/rendergraph/portfile.cmake
@@ -1,15 +1,15 @@
 vcpkg_from_github(OUT_SOURCE_PATH SOURCE_PATH
     REPO DragonJoker/RenderGraph
-    REF c5bdda793361901762d273b76f5a2355f1ab5411
+    REF e9f6b3243095d832f409e15fbafff4f73af6e401
     HEAD_REF master
-    SHA512 c9549ebb6109d6a3a5673b4180fe6f9491d14f8666714451b3bc6b0cd8b6d4a57c79d5122b4729af0e98580418ff0089700ec71c8882744a342e58b4d19be0e3
+    SHA512 3609da52a6f5c5c213fcc56416cfc714403d0bc80742540990cab2973b2ee3e41e9629b04f379db84c2194acff6cecf11c4e86d4167ca78aa8606a85cbcaa433
 )
 
 vcpkg_from_github(OUT_SOURCE_PATH CMAKE_SOURCE_PATH
     REPO DragonJoker/CMakeUtils
-    REF 89a4c8fd4f0a464403676b6b1f1c5d178f6255b3
+    REF 3818effff171f863d0c23e6fbbf79911f03cc6d3
     HEAD_REF master
-    SHA512 98c46a563f2e4a28d9c91f4f255c500118dd0c66a4422d42969bdcc16c1493581db45b709fc3167e1ced8fa628d51880cb459df0d6b3dc013b945f66597ec768
+    SHA512 3a13b371adf24f530fdef6005d3a9105185eca131c9b8aba68615a593f76d6aac2c3c7f0ecf6ce430e2b854afca4abd99f1400a1e6665478c9daa34a8dac6f5b
 )
 
 get_filename_component(SRC_PATH "${CMAKE_SOURCE_PATH}" DIRECTORY)
diff --git a/data/vcpkg/ports/shaderwriter/portfile.cmake b/data/vcpkg/ports/shaderwriter/portfile.cmake
index 1e09a154b0..aaf33ee3cf 100644
--- a/data/vcpkg/ports/shaderwriter/portfile.cmake
+++ b/data/vcpkg/ports/shaderwriter/portfile.cmake
@@ -1,15 +1,15 @@
 vcpkg_from_github(OUT_SOURCE_PATH SOURCE_PATH
     REPO DragonJoker/ShaderWriter
-    REF 42aa67fda5b2a72f21c0fcc9f51c2ed2838e1360
+    REF 6aa2e9bb7f51c80df3ebf5f3d7ea5936d87e671f
     HEAD_REF development
-    SHA512 37826bf50ae09e2e1eea68e21ee2dcf4461b56c3833cc0d19b4005411bfbc81e5bcce2586e63359c6b5226b92d3c71d4d9129319fa85df57013e0fd025538e60
+    SHA512 04c9973c513ff4b912283d4db93e255455e2ed909fa1ebfc09759d39e79bd7e156abf65f5686a8ffa311f7441b9c802c8a346ec41b02eb4e42564b53431042b8
 )
 
 vcpkg_from_github(OUT_SOURCE_PATH CMAKE_SOURCE_PATH
     REPO DragonJoker/CMakeUtils
-    REF 89a4c8fd4f0a464403676b6b1f1c5d178f6255b3
+    REF 3818effff171f863d0c23e6fbbf79911f03cc6d3
     HEAD_REF master
-    SHA512 98c46a563f2e4a28d9c91f4f255c500118dd0c66a4422d42969bdcc16c1493581db45b709fc3167e1ced8fa628d51880cb459df0d6b3dc013b945f66597ec768
+    SHA512 3a13b371adf24f530fdef6005d3a9105185eca131c9b8aba68615a593f76d6aac2c3c7f0ecf6ce430e2b854afca4abd99f1400a1e6665478c9daa34a8dac6f5b
 )
 
 file(REMOVE_RECURSE "${SOURCE_PATH}/CMake")
diff --git a/external/RenderGraph b/external/RenderGraph
index 31396b687f..e9f6b32430 160000
--- a/external/RenderGraph
+++ b/external/RenderGraph
@@ -1 +1 @@
-Subproject commit 31396b687fc090e9fc9a1840f1dc3593a6ccf31c
+Subproject commit e9f6b3243095d832f409e15fbafff4f73af6e401
diff --git a/external/ShaderWriter b/external/ShaderWriter
index 42aa67fda5..6aa2e9bb7f 160000
--- a/external/ShaderWriter
+++ b/external/ShaderWriter
@@ -1 +1 @@
-Subproject commit 42aa67fda5b2a72f21c0fcc9f51c2ed2838e1360
+Subproject commit 6aa2e9bb7f51c80df3ebf5f3d7ea5936d87e671f
diff --git a/external/vcpkg b/external/vcpkg
index 6accd15d64..1c5a340f6e 160000
--- a/external/vcpkg
+++ b/external/vcpkg
@@ -1 +1 @@
-Subproject commit 6accd15d644e93cec849ea346a147828437928b3
+Subproject commit 1c5a340f6e10985e2d92af174a68dbd15c1fa4e1
diff --git a/include/Core/Castor3D/Buffer/GpuBufferOffset.hpp b/include/Core/Castor3D/Buffer/GpuBufferOffset.hpp
index 0ceb99f98c..b2622e1ca8 100644
--- a/include/Core/Castor3D/Buffer/GpuBufferOffset.hpp
+++ b/include/Core/Castor3D/Buffer/GpuBufferOffset.hpp
@@ -10,6 +10,36 @@ See LICENSE file in root folder
 
 namespace castor3d
 {
+	C3D_API void createUniformPassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size );
+	C3D_API void createInputStoragePassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size );
+	C3D_API void createInOutStoragePassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size );
+	C3D_API void createOutputStoragePassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size );
+	C3D_API void createClearableOutputStorageBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size );
 	C3D_API void createUniformPassBinding( crg::FramePass & pass
 		, uint32_t binding
 		, std::string const & name
diff --git a/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp b/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp
index 3d8112b5af..2fe76545c8 100644
--- a/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp
+++ b/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp
@@ -9,7 +9,7 @@ See LICENSE file in root folder
 namespace castor3d
 {
 	C3D_API crg::FramePass const & createBuildLightsBVHPass( crg::FramePassGroup & graph
-		, crg::FramePass const * previousPass
+		, crg::FramePassArray const & previousPasses
 		, RenderDevice const & device
 		, CameraUbo const & cameraUbo
 		, FrustumClusters & clusters );
diff --git a/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp b/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp
index 8d4bda12c1..05aea91cd0 100644
--- a/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp
+++ b/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp
@@ -40,6 +40,8 @@ namespace castor3d
 	using OnClustersBuffersChanged = castor::SignalT< ClustersBuffersChangedFunction >;
 	using OnClustersBuffersChangedConnection = castor::ConnectionT< ClustersBuffersChangedFunction >;
 
+	C3D_API u32 getLightsMortonCodeChunkCount( u32 lightCount );
+
 	//@}
 	//@}
 }
diff --git a/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp b/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp
index 9d6fa718ab..abea8aed90 100644
--- a/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp
+++ b/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp
@@ -140,9 +140,24 @@ namespace castor3d
 			return *m_spotBVHBuffer;
 		}
 
-		void swapLightMortonIndicesIO()
+		void initPointLightMortonIndicesIO()noexcept
 		{
-			m_lightMortonIndicesInput = 1u - m_lightMortonIndicesInput;
+			m_pointLightMortonIndicesInput = 1u;
+		}
+
+		void initSpotLightMortonIndicesIO()noexcept
+		{
+			m_spotLightMortonIndicesInput = 1u;
+		}
+
+		void swapPointLightMortonIndicesIO()noexcept
+		{
+			m_pointLightMortonIndicesInput = 1u - m_pointLightMortonIndicesInput;
+		}
+
+		void swapSpotLightMortonIndicesIO()noexcept
+		{
+			m_spotLightMortonIndicesInput = 1u - m_spotLightMortonIndicesInput;
 		}
 
 		ashes::BufferBase & getPointLightIndicesBuffer( uint32_t index )const noexcept
@@ -167,42 +182,47 @@ namespace castor3d
 
 		ashes::BufferBase & getInputPointLightIndicesBuffer()const noexcept
 		{
-			return getPointLightIndicesBuffer( m_lightMortonIndicesInput );
+			return getPointLightIndicesBuffer( m_pointLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getInputSpotLightIndicesBuffer()const noexcept
 		{
-			return getSpotLightIndicesBuffer( m_lightMortonIndicesInput );
+			return getSpotLightIndicesBuffer( m_spotLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getOutputPointLightIndicesBuffer()const noexcept
 		{
-			return getPointLightIndicesBuffer( 1u - m_lightMortonIndicesInput );
+			return getPointLightIndicesBuffer( 1u - m_pointLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getOutputSpotLightIndicesBuffer()const noexcept
 		{
-			return getSpotLightIndicesBuffer( 1u - m_lightMortonIndicesInput );
+			return getSpotLightIndicesBuffer( 1u - m_spotLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getInputPointLightMortonCodesBuffer()const noexcept
 		{
-			return getPointLightMortonCodesBuffer( m_lightMortonIndicesInput );
+			return getPointLightMortonCodesBuffer( m_pointLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getInputSpotLightMortonCodesBuffer()const noexcept
 		{
-			return getSpotLightMortonCodesBuffer( m_lightMortonIndicesInput );
+			return getSpotLightMortonCodesBuffer( m_spotLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getOutputPointLightMortonCodesBuffer()const noexcept
 		{
-			return getPointLightMortonCodesBuffer( 1u - m_lightMortonIndicesInput );
+			return getPointLightMortonCodesBuffer( 1u - m_pointLightMortonIndicesInput );
 		}
 
 		ashes::BufferBase & getOutputSpotLightMortonCodesBuffer()const noexcept
 		{
-			return getSpotLightMortonCodesBuffer( 1u - m_lightMortonIndicesInput );
+			return getSpotLightMortonCodesBuffer( 1u - m_spotLightMortonIndicesInput );
+		}
+
+		ashes::BufferBase & getMergePathPartitionsBuffer()const noexcept
+		{
+			return *m_mergePathPartitionsBuffer;
 		}
 
 		auto & getCamera()const noexcept
@@ -229,7 +249,8 @@ namespace castor3d
 		bool m_clustersDirty{ true };
 		bool m_lightsDirty{ true };
 		bool m_first{ true };
-		uint32_t m_lightMortonIndicesInput{ 1u };
+		uint32_t m_pointLightMortonIndicesInput{ 1u };
+		uint32_t m_spotLightMortonIndicesInput{ 1u };
 		castor::GroupChangeTracked< castor::Point3ui > m_dimensions;
 		castor::GroupChangeTracked< uint32_t > m_clusterSize;
 		castor::GroupChangeTracked< castor::Matrix4x4f > m_cameraProjection;
@@ -242,6 +263,7 @@ namespace castor3d
 		ashes::BufferBasePtr m_pointLightClusterIndexBuffer;
 		ashes::BufferBasePtr m_spotLightClusterIndexBuffer;
 		ashes::BufferBasePtr m_lightsAABBBuffer;
+		ashes::BufferBasePtr m_mergePathPartitionsBuffer;
 		std::array< ashes::BufferBasePtr, 2u > m_pointMortonCodesBuffers;
 		std::array< ashes::BufferBasePtr, 2u > m_spotMortonCodesBuffers;
 		std::array< ashes::BufferBasePtr, 2u > m_pointIndicesBuffers;
diff --git a/include/Core/Castor3D/Render/Clustered/SortLightsMortonCode.hpp b/include/Core/Castor3D/Render/Clustered/SortLightsMortonCode.hpp
new file mode 100644
index 0000000000..63dc97f7c2
--- /dev/null
+++ b/include/Core/Castor3D/Render/Clustered/SortLightsMortonCode.hpp
@@ -0,0 +1,18 @@
+﻿/*
+See LICENSE file in root folder
+*/
+#ifndef ___C3D_SortLightsMortonCode_H___
+#define ___C3D_SortLightsMortonCode_H___
+
+#include "ClusteredModule.hpp"
+
+namespace castor3d
+{
+	C3D_API crg::FramePassArray createSortLightsMortonCodePass( crg::FramePassGroup & graph
+		, crg::FramePass const * previousPass
+		, RenderDevice const & device
+		, CameraUbo const & cameraUbo
+		, FrustumClusters & clusters );
+}
+
+#endif
diff --git a/include/Core/CastorUtils/Design/DataHolder.hpp b/include/Core/CastorUtils/Design/DataHolder.hpp
index 2d4e0d49af..e3ffab0a88 100644
--- a/include/Core/CastorUtils/Design/DataHolder.hpp
+++ b/include/Core/CastorUtils/Design/DataHolder.hpp
@@ -14,7 +14,7 @@ namespace castor
 	public:
 		DataHolderT()noexcept = default;
 
-		DataHolderT( Data d )noexcept
+		explicit DataHolderT( Data d )noexcept
 			: m_data{ std::move( d ) }
 		{
 		}
diff --git a/include/Core/CastorUtils/Exception/Assertion.hpp b/include/Core/CastorUtils/Exception/Assertion.hpp
index 7f54bf7d52..93e8bce396 100644
--- a/include/Core/CastorUtils/Exception/Assertion.hpp
+++ b/include/Core/CastorUtils/Exception/Assertion.hpp
@@ -11,7 +11,6 @@ See LICENSE file in root folder
 namespace castor
 {
 	CU_API void cuLogError( char const * const description );
-	[[ noreturn ]]
 	CU_API void cuFailure( char const * const description );
 }
 
diff --git a/source/Core/Castor3D/Buffer/GpuBufferPool.cpp b/source/Core/Castor3D/Buffer/GpuBufferPool.cpp
index 09feef5e5e..26eba01e93 100644
--- a/source/Core/Castor3D/Buffer/GpuBufferPool.cpp
+++ b/source/Core/Castor3D/Buffer/GpuBufferPool.cpp
@@ -15,6 +15,93 @@ namespace castor3d
 {
 	//*********************************************************************************************
 
+	namespace gpupol
+	{
+		static crg::VkBufferArray makeVkArray( std::vector< ashes::BufferBase const * > const & buffers )
+		{
+			crg::VkBufferArray result;
+
+			for ( auto buffer : buffers )
+			{
+				result.push_back( *buffer );
+			}
+
+			return result;
+		}
+	}
+
+	//*********************************************************************************************
+
+	void createUniformPassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size )
+	{
+		crg::VkBufferArray vkBuffers = gpupol::makeVkArray( buffers );
+		pass.addUniformBuffer( { vkBuffers, name }
+			, binding
+			, offset
+			, size );
+	}
+
+	void createInputStoragePassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size )
+	{
+		crg::VkBufferArray vkBuffers = gpupol::makeVkArray( buffers );
+		pass.addInputStorageBuffer( { vkBuffers, name }
+			, binding
+			, offset
+			, size );
+	}
+
+	void createInOutStoragePassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size )
+	{
+		crg::VkBufferArray vkBuffers = gpupol::makeVkArray( buffers );
+		pass.addInOutStorageBuffer( { vkBuffers, name }
+			, binding
+			, offset
+			, size );
+	}
+
+	void createOutputStoragePassBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size )
+	{
+		crg::VkBufferArray vkBuffers = gpupol::makeVkArray( buffers );
+		pass.addOutputStorageBuffer( { vkBuffers, name }
+			, binding
+			, offset
+			, size );
+	}
+
+	void createClearableOutputStorageBinding( crg::FramePass & pass
+		, uint32_t binding
+		, std::string const & name
+		, std::vector< ashes::BufferBase const * > buffers
+		, VkDeviceSize offset
+		, VkDeviceSize size )
+	{
+		crg::VkBufferArray vkBuffers = gpupol::makeVkArray( buffers );
+		pass.addClearableOutputStorageBuffer( { vkBuffers, name }
+			, binding
+			, offset
+			, size );
+	}
+
 	void createUniformPassBinding( crg::FramePass & pass
 		, uint32_t binding
 		, std::string const & name
@@ -22,8 +109,10 @@ namespace castor3d
 		, VkDeviceSize offset
 		, VkDeviceSize size )
 	{
-		pass.addUniformBuffer( { buffer, name }
+		createUniformPassBinding( pass
 			, binding
+			, name
+			, { &buffer }
 			, offset
 			, size );
 	}
@@ -35,8 +124,10 @@ namespace castor3d
 		, VkDeviceSize offset
 		, VkDeviceSize size )
 	{
-		pass.addInputStorageBuffer( { buffer, name }
+		createInputStoragePassBinding( pass
 			, binding
+			, name
+			, { &buffer }
 			, offset
 			, size );
 	}
@@ -48,8 +139,10 @@ namespace castor3d
 		, VkDeviceSize offset
 		, VkDeviceSize size )
 	{
-		pass.addInOutStorageBuffer( { buffer, name }
+		createInOutStoragePassBinding( pass
 			, binding
+			, name
+			, { &buffer }
 			, offset
 			, size );
 	}
@@ -61,8 +154,10 @@ namespace castor3d
 		, VkDeviceSize offset
 		, VkDeviceSize size )
 	{
-		pass.addOutputStorageBuffer( { buffer, name }
+		createOutputStoragePassBinding( pass
 			, binding
+			, name
+			, { &buffer }
 			, offset
 			, size );
 	}
@@ -74,8 +169,10 @@ namespace castor3d
 		, VkDeviceSize offset
 		, VkDeviceSize size )
 	{
-		pass.addClearableOutputStorageBuffer( { buffer, name }
+		createClearableOutputStorageBinding( pass
 			, binding
+			, name
+			, { &buffer }
 			, offset
 			, size );
 	}
diff --git a/source/Core/Castor3D/CMakeLists.txt b/source/Core/Castor3D/CMakeLists.txt
index e8fc40c6bc..07585c20ae 100644
--- a/source/Core/Castor3D/CMakeLists.txt
+++ b/source/Core/Castor3D/CMakeLists.txt
@@ -1138,6 +1138,7 @@ set( ${PROJECT_NAME}_FOLDER_SRC_FILES
 	${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/ComputeLightsMortonCode.cpp
 	${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/FrustumClusters.cpp
 	${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/ReduceLightsAABB.cpp
+	${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/SortLightsMortonCode.cpp
 )
 set( ${PROJECT_NAME}_FOLDER_HDR_FILES
 	${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/AssignLightsToClusters.hpp
@@ -1147,6 +1148,7 @@ set( ${PROJECT_NAME}_FOLDER_HDR_FILES
 	${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/ComputeLightsMortonCode.hpp
 	${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/FrustumClusters.hpp
 	${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/ReduceLightsAABB.hpp
+	${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/SortLightsMortonCode.hpp
 )
 set( ${PROJECT_NAME}_SRC_FILES
 	${${PROJECT_NAME}_SRC_FILES}
diff --git a/source/Core/Castor3D/DebugDefines.hpp b/source/Core/Castor3D/DebugDefines.hpp
index f5970f3f1e..a222885256 100644
--- a/source/Core/Castor3D/DebugDefines.hpp
+++ b/source/Core/Castor3D/DebugDefines.hpp
@@ -16,5 +16,6 @@ See LICENSE file in root folder
 #define C3D_DebugDisableSafeBands 0
 
 #define C3D_DebugUseLightsBVH 1
+#define C3D_DebugSortLightsMortonCode 1
 
 #endif
diff --git a/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp b/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp
index 4ad5400dca..7c945b8118 100644
--- a/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp
+++ b/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp
@@ -538,15 +538,42 @@ namespace castor3d
 				, crg::ComputePass{framePass
 					, context
 					, graph
-					, crg::ru::Config{}
+#if C3D_DebugUseLightsBVH && C3D_DebugSortLightsMortonCode
+					, crg::ru::Config{ 2u }
+#else
+					, crg::ru::Config{ 1u }
+#endif
 					, config
+						.getPassIndex( RunnablePass::GetPassIndexCallback( [this](){ return doGetPassIndex(); } ) )
 						.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( CreateInfoHolder::getData() ) )
 						.enabled( &clusters.needsLightsUpdate() )
 						.end( RecordCallback{ [this]( crg::RecordContext & ctx, VkCommandBuffer cb, uint32_t idx ) { doPostRecord( ctx, cb, idx ); } } ) }
+				, m_lightCache{ clusters.getCamera().getScene()->getLightCache() }
 			{
 			}
 
 		private:
+			uint32_t doGetPassIndex()
+			{
+#if C3D_DebugUseLightsBVH && C3D_DebugSortLightsMortonCode
+				// The index is derived from the chunk count of the larger of the
+				// point and spot light lists: 0 for at most one chunk, otherwise
+				// the parity of ( chunk count - 1 ).
+				auto const pointCount = m_lightCache.getLightsCount( LightType::ePoint );
+				auto const spotCount = m_lightCache.getLightsCount( LightType::eSpot );
+				auto const chunkCount = getLightsMortonCodeChunkCount( std::max( pointCount, spotCount ) );
+
+				if ( chunkCount <= 1u )
+				{
+					return 0u;
+				}
+
+				return ( chunkCount - 1u ) % 2u;
+#else
+				return 0u;
+#endif
+			}
+
 			void doPostRecord( crg::RecordContext & context
 				, VkCommandBuffer commandBuffer
 				, uint32_t index )
@@ -559,10 +586,10 @@ namespace castor3d
 						&& attach.isStorageBuffer()
 						&& attach.isClearableBuffer() )
 					{
-						auto currentState = context.getAccessState( buffer.buffer.buffer
+						auto currentState = context.getAccessState( buffer.buffer.buffer( index )
 							, buffer.range );
 						context.memoryBarrier( commandBuffer
-							, buffer.buffer.buffer
+							, buffer.buffer.buffer( index )
 							, buffer.range
 							, currentState.access
 							, currentState.pipelineStage
@@ -570,6 +597,9 @@ namespace castor3d
 					}
 				}
 			}
+
+		private:
+			LightCache const & m_lightCache;
 		};
 	}
 
@@ -610,8 +640,13 @@ namespace castor3d
 #if C3D_DebugUseLightsBVH
 		createInputStoragePassBinding( pass, uint32_t( dspclst::ePointLightBVH ), "C3D_PointLightsBVH", clusters.getPointLightBVHBuffer(), 0u, ashes::WholeSize );
 		createInputStoragePassBinding( pass, uint32_t( dspclst::eSpotLightBVH ), "C3D_SpotLightsBVH", clusters.getSpotLightBVHBuffer(), 0u, ashes::WholeSize );
+#	if C3D_DebugSortLightsMortonCode
+		createInputStoragePassBinding( pass, uint32_t( dspclst::ePointLightIndices ), "C3D_PointLightIndices", { &clusters.getOutputPointLightIndicesBuffer(), &clusters.getInputPointLightIndicesBuffer() }, 0u, ashes::WholeSize );
+		createInputStoragePassBinding( pass, uint32_t( dspclst::eSpotLightIndices ), "C3D_SpotLightIndices", { &clusters.getOutputSpotLightIndicesBuffer(), &clusters.getInputSpotLightIndicesBuffer() }, 0u, ashes::WholeSize );
+#	else
 		createInputStoragePassBinding( pass, uint32_t( dspclst::ePointLightIndices ), "C3D_PointLightIndices", clusters.getInputPointLightIndicesBuffer(), 0u, ashes::WholeSize );
 		createInputStoragePassBinding( pass, uint32_t( dspclst::eSpotLightIndices ), "C3D_SpotLightIndices", clusters.getInputSpotLightIndicesBuffer(), 0u, ashes::WholeSize );
+#	endif
 #endif
 		return pass;
 	}
diff --git a/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp b/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp
index 4c03577a58..59142c1f6f 100644
--- a/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp
+++ b/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp
@@ -1,5 +1,6 @@
 #include "Castor3D/Render/Clustered/BuildLightsBVH.hpp"
 
+#include "Castor3D/DebugDefines.hpp"
 #include "Castor3D/Engine.hpp"
 #include "Castor3D/Cache/LightCache.hpp"
 #include "Castor3D/Render/RenderDevice.hpp"
@@ -24,6 +25,8 @@
 #include <RenderGraph/FramePassGroup.hpp>
 #include <RenderGraph/RunnablePasses/ComputePass.hpp>
 
+#include <limits>
+
 namespace castor3d
 {
 	//*********************************************************************************************
@@ -42,6 +45,7 @@ namespace castor3d
 		};
 
 		static uint32_t constexpr NumThreads = 32u * 16u;
+		static float constexpr FltMax = std::numeric_limits< float >::max();
 
 		static ShaderPtr createShader( bool bottomLevel )
 		{
@@ -149,8 +153,8 @@ namespace castor3d
 						}
 						ELSE
 						{
-							aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f );
-							aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f );
+							aabbMin = vec4( sdw::Float{ FltMax }, FltMax, FltMax, 1.0f );
+							aabbMax = vec4( sdw::Float{ -FltMax }, -FltMax, -FltMax, 1.0f );
 						}
 						FI;
 
@@ -197,8 +201,8 @@ namespace castor3d
 						}
 						ELSE
 						{
-							aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f );
-							aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f );
+							aabbMin = vec4( sdw::Float{ FltMax }, FltMax, FltMax, 1.0f );
+							aabbMax = vec4( sdw::Float{ -FltMax }, -FltMax, -FltMax, 1.0f );
 						}
 						FI;
 
@@ -247,8 +251,8 @@ namespace castor3d
 						}
 						ELSE
 						{
-							aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f );
-							aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f );
+							aabbMin = vec4( sdw::Float{ FltMax }, FltMax, FltMax, 1.0f );
+							aabbMax = vec4( sdw::Float{ -FltMax }, -FltMax, -FltMax, 1.0f );
 						}
 						FI;
 
@@ -289,8 +293,8 @@ namespace castor3d
 						}
 						ELSE
 						{
-							aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f );
-							aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f );
+							aabbMin = vec4( sdw::Float{ FltMax }, FltMax, FltMax, 1.0f );
+							aabbMax = vec4( sdw::Float{ -FltMax }, -FltMax, -FltMax, 1.0f );
 						}
 						FI;
 
@@ -337,13 +341,18 @@ namespace castor3d
 					, { [this]( uint32_t index ){ doInitialise( index ); }
 						, GetPipelineStateCallback( [](){ return crg::getPipelineState( VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); } )
 						, [this]( crg::RecordContext & recContext, VkCommandBuffer cb, uint32_t i ){ doRecordInto( recContext, cb, i ); }
-						, GetPassIndexCallback( [](){ return 0u; } )
+						, GetPassIndexCallback( [this](){ return doGetPassIndex(); } )
 						, IsEnabledCallback( [this](){ return doIsEnabled(); } )
 						, IsComputePassCallback( [](){ return true; } ) }
+#if C3D_DebugSortLightsMortonCode
+					, crg::ru::Config{ 2u, true /* resettable */ } }
+#else
 					, crg::ru::Config{ 1u, true /* resettable */ } }
+#endif
+				, m_clusters{ clusters }
 				, m_lightCache{ clusters.getCamera().getScene()->getLightCache() }
-				, m_bottom{ framePass, context, graph, device, true, clusters }
-				, m_top{ framePass, context, graph, device, false, clusters }
+				, m_bottom{ framePass, context, graph, device, true, this }
+				, m_top{ framePass, context, graph, device, false, this }
 			{
 			}
 
@@ -371,12 +380,13 @@ namespace castor3d
 					, crg::RunnableGraph & graph
 					, RenderDevice const & device
 					, bool bottomLevel
-					, FrustumClusters const & clusters )
+					, FramePass * parent )
 					: shader{ VK_SHADER_STAGE_COMPUTE_BIT, "BuildLightsBVH", createShader( bottomLevel ) }
 					, createInfo{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, shader ) } }
-					, cpConfig{ crg::defaultV< uint32_t const * >
-						, &clusters.needsLightsUpdate()
+					, cpConfig{ crg::getDefaultV< InitialiseCallback >()
+						, &parent->m_clusters.needsLightsUpdate()
 						, crg::getDefaultV< IsEnabledCallback >()
+						, GetPassIndexCallback( [parent]() { return parent->doGetPassIndex(); } )
 						, crg::getDefaultV< RecordCallback >()
 						, crg::getDefaultV< RecordCallback >()
 						, 1u
@@ -386,15 +396,20 @@ namespace castor3d
 						, context
 						, graph
 						, crg::pp::Config{}
-						.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( createInfo ) )
-					.pushConstants( VkPushConstantRange{ VK_SHADER_STAGE_COMPUTE_BIT, 0u, 4u } )
+							.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( createInfo ) )
+							.pushConstants( VkPushConstantRange{ VK_SHADER_STAGE_COMPUTE_BIT, 0u, 4u } )
 						, VK_PIPELINE_BIND_POINT_COMPUTE
+#if C3D_DebugSortLightsMortonCode
+						, 2u }
+#else
 						, 1u }
+#endif
 				{
 				}
 			};
 
 		private:
+			FrustumClusters const & m_clusters;
 			LightCache const & m_lightCache;
 			Pipeline m_bottom;
 			Pipeline m_top;
@@ -408,6 +423,27 @@ namespace castor3d
 				doCreatePipeline( index, m_top );
 			}
 
+			uint32_t doGetPassIndex()
+			{
+#if C3D_DebugSortLightsMortonCode
+				// The index is derived from the chunk count of the larger of the
+				// point and spot light lists: 0 for at most one chunk, otherwise
+				// the parity of ( chunk count - 1 ).
+				auto const pointCount = m_lightCache.getLightsCount( LightType::ePoint );
+				auto const spotCount = m_lightCache.getLightsCount( LightType::eSpot );
+				auto const chunkCount = getLightsMortonCodeChunkCount( std::max( pointCount, spotCount ) );
+
+				if ( chunkCount <= 1u )
+				{
+					return 0u;
+				}
+
+				return ( chunkCount - 1u ) % 2u;
+#else
+				return 0u;
+#endif
+			}
+
 			bool doIsEnabled()const
 			{
 				return ( m_bottom.cpConfig.isEnabled ? ( *m_bottom.cpConfig.isEnabled )() : false )
@@ -422,11 +458,11 @@ namespace castor3d
 				auto pointLightsCount = m_lightCache.getLightsCount( LightType::ePoint );
 				auto spoLightsCount = m_lightCache.getLightsCount( LightType::eSpot );
 				auto maxLeaves = std::max( pointLightsCount, spoLightsCount );
-				auto numThreadGroups = uint32_t( std::ceil( float( maxLeaves ) / float( 32u * 16u ) ) );
+				auto numThreadGroups = uint32_t( std::ceil( float( maxLeaves ) / float( NumThreads ) ) );
 				m_bottom.pipeline.recordInto( context, commandBuffer, index );
 				m_context.vkCmdDispatch( commandBuffer, numThreadGroups, 1u, 1u );
 				uint32_t maxLevels = FrustumClusters::getNumLevels( maxLeaves );
-				doBarriers( context, commandBuffer, 0 );
+				doBarriers( context, commandBuffer, index, 0 );
 
 				// Now build upper levels of the BVH.
 				if ( maxLevels > 1u )
@@ -435,7 +471,7 @@ namespace castor3d
 
 					for ( uint32_t level = maxLevels - 1u; level > 0; --level )
 					{
-						doBarriers( context, commandBuffer, 1 );
+						doBarriers( context, commandBuffer, index, 1 );
 						m_context.vkCmdPushConstants( commandBuffer, m_top.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 4u, &level );
 						uint32_t numChildNodes = FrustumClusters::getNumLevelNodes( level );
 						numThreadGroups = uint32_t( std::ceil( float( numChildNodes ) / float( NumThreads ) ) );
@@ -443,11 +479,12 @@ namespace castor3d
 					}
 				}
 
-				doBarriers( context, commandBuffer, 2 );
+				doBarriers( context, commandBuffer, index, 2 );
 			}
 
 			void doBarriers( crg::RecordContext & context
 				, VkCommandBuffer commandBuffer
+				, uint32_t passIndex
 				, int idx )
 			{
 				for ( auto & attach : m_pass.buffers )
@@ -458,10 +495,10 @@ namespace castor3d
 						&& attach.isStorageBuffer()
 						&& attach.isClearableBuffer() )
 					{
-						auto currentState = context.getAccessState( buffer.buffer.buffer
+						auto currentState = context.getAccessState( buffer.buffer.buffer( passIndex )
 							, buffer.range );
 						context.memoryBarrier( commandBuffer
-							, buffer.buffer.buffer
+							, buffer.buffer.buffer( passIndex )
 							, buffer.range
 							, currentState.access
 							, currentState.pipelineStage
@@ -492,7 +529,7 @@ namespace castor3d
 	//*********************************************************************************************
 
 	crg::FramePass const & createBuildLightsBVHPass( crg::FramePassGroup & graph
-		, crg::FramePass const * previousPass
+		, crg::FramePassArray const & previousPasses
 		, RenderDevice const & device
 		, CameraUbo const & cameraUbo
 		, FrustumClusters & clusters )
@@ -511,13 +548,18 @@ namespace castor3d
 					, result->getTimer() );
 				return result;
 			} );
-		pass.addDependency( *previousPass );
+		pass.addDependencies( previousPasses );
 		cameraUbo.createPassBinding( pass, lgtbvh::eCamera );
 		auto & lights = clusters.getCamera().getScene()->getLightCache();
 		lights.createPassBinding( pass, lgtbvh::eLights );
 		clusters.getClustersUbo().createPassBinding( pass, lgtbvh::eClusters );
+#if C3D_DebugSortLightsMortonCode
+		createInputStoragePassBinding( pass, uint32_t( lgtbvh::ePointLightIndices ), "C3D_PointLightIndices", { &clusters.getOutputPointLightIndicesBuffer(), &clusters.getInputPointLightIndicesBuffer() }, 0u, ashes::WholeSize );
+		createInputStoragePassBinding( pass, uint32_t( lgtbvh::eSpotLightIndices ), "C3D_SpotLightIndices", { &clusters.getOutputSpotLightIndicesBuffer(), &clusters.getInputSpotLightIndicesBuffer() }, 0u, ashes::WholeSize );
+#else
 		createInputStoragePassBinding( pass, uint32_t( lgtbvh::ePointLightIndices ), "C3D_PointLightIndices", clusters.getInputPointLightIndicesBuffer(), 0u, ashes::WholeSize );
 		createInputStoragePassBinding( pass, uint32_t( lgtbvh::eSpotLightIndices ), "C3D_SpotLightIndices", clusters.getInputSpotLightIndicesBuffer(), 0u, ashes::WholeSize );
+#endif
 		createClearableOutputStorageBinding( pass, uint32_t( lgtbvh::ePointLightBVH ), "C3D_PointLightBVH", clusters.getPointLightBVHBuffer(), 0u, ashes::WholeSize );
 		createClearableOutputStorageBinding( pass, uint32_t( lgtbvh::eSpotLightBVH ), "C3D_SpotLightBVH", clusters.getSpotLightBVHBuffer(), 0u, ashes::WholeSize );
 		return pass;
diff --git a/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp b/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp
index afc2e08506..1db32ca1bf 100644
--- a/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp
+++ b/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp
@@ -186,10 +186,10 @@ namespace castor3d
 						&& attach.isStorageBuffer()
 						&& attach.isClearableBuffer() )
 					{
-						auto currentState = context.getAccessState( buffer.buffer.buffer
+						auto currentState = context.getAccessState( buffer.buffer.buffer( index )
 							, buffer.range );
 						context.memoryBarrier( commandBuffer
-							, buffer.buffer.buffer
+							, buffer.buffer.buffer( index )
 							, buffer.range
 							, currentState.access
 							, currentState.pipelineStage
diff --git a/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp b/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp
index 46e83a1e9b..e5d06af710 100644
--- a/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp
+++ b/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp
@@ -1,5 +1,6 @@
 #include "Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp"
 
+#include "Castor3D/DebugDefines.hpp"
 #include "Castor3D/Engine.hpp"
 #include "Castor3D/Cache/LightCache.hpp"
 #include "Castor3D/Render/RenderDevice.hpp"
@@ -107,7 +108,7 @@ namespace castor3d
 						bitShift += 2u;
 					}
 
-					return mortonCode;
+					writer.returnStmt( mortonCode );
 				}
 				, sdw::InUVec3{ writer, "quantizedCoord" } );
 
@@ -175,11 +176,22 @@ namespace castor3d
 			using ShaderHolder = DataHolderT< ShaderModule >;
 			using CreateInfoHolder = DataHolderT< ashes::PipelineShaderStageCreateInfoArray >;
 
+			void doInitClustersBuffersIndices()const // Runs init then swap on the clusters' point and spot Morton indices I/O.
+			{
+#if C3D_DebugSortLightsMortonCode
+				m_clusters.initPointLightMortonIndicesIO();
+				m_clusters.initSpotLightMortonIndicesIO();
+				m_clusters.swapPointLightMortonIndicesIO(); // NOTE(review): presumably restores the I/O selection left by pass creation - confirm in FrustumClusters.
+				m_clusters.swapSpotLightMortonIndicesIO();
+#endif
+			}
+
 		public:
 			FramePass( crg::FramePass const & framePass
 				, crg::GraphContext & context
 				, crg::RunnableGraph & graph
 				, RenderDevice const & device
+				, FrustumClusters & clusters
 				, crg::cp::Config config )
 				: ShaderHolder{ ShaderModule{ VK_SHADER_STAGE_COMPUTE_BIT, "ComputeLightsMortonCode", createShader() } }
 				, CreateInfoHolder{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, ShaderHolder::getData() ) } }
@@ -188,9 +200,13 @@ namespace castor3d
 					, graph
 					, crg::ru::Config{}
 					, config
+						.getPassIndex( GetPassIndexCallback{ [this]() { doInitClustersBuffersIndices(); return 0u; } } )
+						.initialise( InitialiseCallback{ [this]( uint32_t idx ) { doInitClustersBuffersIndices(); } } )
 						.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( CreateInfoHolder::getData() ) )
 						.end( RecordCallback{ [this]( crg::RecordContext & ctx, VkCommandBuffer cb, uint32_t idx ) { doPostRecord( ctx, cb, idx ); } } ) }
+				, m_clusters{ clusters }
 			{
+				doInitClustersBuffersIndices();
 			}
 
 		private:
@@ -206,17 +222,21 @@ namespace castor3d
 						&& attach.isStorageBuffer()
 						&& attach.isClearableBuffer() )
 					{
-						auto currentState = context.getAccessState( buffer.buffer.buffer
+						auto currentState = context.getAccessState( buffer.buffer.buffer( index )
 							, buffer.range );
 						context.memoryBarrier( commandBuffer
-							, buffer.buffer.buffer
+							, buffer.buffer.buffer( index )
 							, buffer.range
 							, currentState.access
 							, currentState.pipelineStage
 							, { VK_ACCESS_SHADER_READ_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT } );
 					}
 				}
+
+				doInitClustersBuffersIndices();
 			}
+
+			FrustumClusters & m_clusters;
 		};
 	}
 
@@ -238,6 +258,7 @@ namespace castor3d
 					, context
 					, graph
 					, device
+					, clusters
 					, crg::cp::Config{}
 						.groupCountX( numThreadGroups )
 						.enabled( &clusters.needsClustersUpdate() ) );
@@ -251,11 +272,18 @@ namespace castor3d
 		lights.createPassBinding( pass, cmpmrt::eLights );
 		clusters.getClustersUbo().createPassBinding( pass, cmpmrt::eClusters );
 		createInputStoragePassBinding( pass, uint32_t( cmpmrt::eLightsAABB ), "C3D_LightsAABB", clusters.getLightsAABBBuffer(), 0u, ashes::WholeSize );
+#if C3D_DebugSortLightsMortonCode
+		clusters.initPointLightMortonIndicesIO();
+		clusters.initSpotLightMortonIndicesIO();
+#endif
 		createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::ePointLightMortonCodes ), "C3D_PointLightMortonCodes", clusters.getOutputPointLightMortonCodesBuffer(), 0u, ashes::WholeSize );
 		createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::eSpotLightMortonCodes ), "C3D_SpotLightMortonCodes", clusters.getOutputSpotLightMortonCodesBuffer(), 0u, ashes::WholeSize );
 		createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::ePointLightIndices ), "C3D_PointLightIndices", clusters.getOutputPointLightIndicesBuffer(), 0u, ashes::WholeSize );
 		createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::eSpotLightIndices ), "C3D_SpotLightIndices", clusters.getOutputSpotLightIndicesBuffer(), 0u, ashes::WholeSize );
-		clusters.swapLightMortonIndicesIO();
+#if C3D_DebugSortLightsMortonCode
+		clusters.swapPointLightMortonIndicesIO();
+		clusters.swapSpotLightMortonIndicesIO();
+#endif
 		return pass;
 	}
 
diff --git a/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp b/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp
index b0785a357a..e78c23a43e 100644
--- a/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp
+++ b/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp
@@ -10,6 +10,7 @@
 #include "Castor3D/Render/Clustered/ComputeClustersAABB.hpp"
 #include "Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp"
 #include "Castor3D/Render/Clustered/ReduceLightsAABB.hpp"
+#include "Castor3D/Render/Clustered/SortLightsMortonCode.hpp"
 #include "Castor3D/Scene/Camera.hpp"
 #include "Castor3D/Scene/Scene.hpp"
 #include "Castor3D/Scene/Light/DirectionalLight.hpp"
@@ -118,6 +119,40 @@ namespace castor3d
 			, "C3D_SpotLightBVH" ) }
 #endif
 	{
+#if C3D_DebugUseLightsBVH
+#	if C3D_DebugSortLightsMortonCode
+		static uint32_t constexpr NumThreadsPerThreadGroup = 256u;
+		static uint32_t constexpr ElementsPerThread = 8u;
+
+		// The maximum number of elements that need to be sorted.
+		uint32_t maxElements = MaxLightsCount;
+
+		// Radix sort will sort Morton codes (keys) into chunks of NumThreadsPerThreadGroup size.
+		uint32_t chunkSize = NumThreadsPerThreadGroup;
+		// The number of chunks that need to be merge sorted after Radix sort finishes.
+		uint32_t numChunks = uint32_t( std::ceil( float( maxElements ) / float( chunkSize ) ) );
+		// The number of sort groups that are needed to sort the first set of chunks.
+		// Each sort group will sort 2 chunks, so the maximum number of sort groups is half the
+		// number of chunks (integer division).
+		uint32_t maxSortGroups = numChunks / 2u;
+		// The number of merge path partitions per sort group is the total values
+		// to be sorted per sort group (2 chunks) divided by the number of elements
+		// that can be sorted per thread group. One is added to account for the
+		// merge path partition at the END of the chunk.
+		uint32_t numMergePathPartitionsPerSortGroup = uint32_t( std::ceil( float( chunkSize * 2u ) / float( ElementsPerThread * NumThreadsPerThreadGroup ) ) ) + 1u;
+
+		// The maximum number of merge path partitions is the number of merge path partitions
+		// needed by a single sort group multiplied by the maximum number of sort groups.
+		uint32_t maxMergePathPartitions = numMergePathPartitionsPerSortGroup * maxSortGroups;
+
+		m_mergePathPartitionsBuffer = makeBufferBase( m_device
+			, sizeof( u32 ) * maxMergePathPartitions
+			, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT
+			, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+			, "C3D_MergePathPartitions" );
+#	endif
+#endif
+
 		doUpdate();
 	}
 
@@ -145,17 +180,21 @@ namespace castor3d
 		, CameraUbo const & cameraUbo )
 	{
 		auto & graph = parentGraph.createPassGroup( "Clusters" );
-		auto lastPass = &createComputeClustersAABBPass( graph, previousPass
-			, m_device, cameraUbo, *this );
+		crg::FramePassArray lastPasses = { &createComputeClustersAABBPass( graph, previousPass
+			, m_device, cameraUbo, *this ) };
 #if C3D_DebugUseLightsBVH
-		lastPass = &createReduceLightsAABBPass( graph, lastPass
-			, m_device, cameraUbo, *this );
-		lastPass = &createComputeLightsMortonCodePass( graph, lastPass
-			, m_device, cameraUbo, *this );
-		lastPass = &createBuildLightsBVHPass( graph, lastPass
+		lastPasses = { &createReduceLightsAABBPass( graph, lastPasses.front()
+			, m_device, cameraUbo, *this ) };
+		lastPasses = { &createComputeLightsMortonCodePass( graph, lastPasses.front()
+			, m_device, cameraUbo, *this ) };
+#	if C3D_DebugSortLightsMortonCode
+		lastPasses = createSortLightsMortonCodePass( graph, lastPasses.front()
 			, m_device, cameraUbo, *this );
+#	endif
+		lastPasses = { &createBuildLightsBVHPass( graph, lastPasses
+			, m_device, cameraUbo, *this ) };
 #endif
-		return createAssignLightsToClustersPass( graph, lastPass
+		return createAssignLightsToClustersPass( graph, lastPasses.front()
 			, m_device, cameraUbo, *this );
 	}
 
diff --git a/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp b/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp
index 53c7b70987..af7e6be5cd 100644
--- a/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp
+++ b/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp
@@ -25,6 +25,8 @@
 #include <RenderGraph/FramePassGroup.hpp>
 #include <RenderGraph/RunnablePasses/ComputePass.hpp>
 
+#include <limits>
+
 #define C3D_DebugEnableWarpOptimisation 0
 
 namespace castor3d
@@ -42,6 +44,7 @@ namespace castor3d
 		};
 
 		static uint32_t constexpr NumThreads = 512u;
+		static float constexpr FltMax = std::numeric_limits< float >::max();
 
 		static ShaderPtr createShader( bool first )
 		{
@@ -151,9 +154,9 @@ namespace castor3d
 					auto threadIndex = in.globalInvocationID.x();
 
 					auto aabbMin = writer.declLocale( "aabbMin"
-						, vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f ) );
+						, vec4( sdw::Float{ FltMax }, FltMax, FltMax, 1.0f ) );
 					auto aabbMax = writer.declLocale( "aabbMax"
-						, vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f ) );
+						, vec4( sdw::Float{ -FltMax }, -FltMax, -FltMax, 1.0f ) );
 
 					if ( first )
 					{
@@ -277,9 +280,10 @@ namespace castor3d
 					, FrustumClusters const & clusters )
 					: shader{ VK_SHADER_STAGE_COMPUTE_BIT, "ReduceLightsAABB" + ( first ? std::string{ "/First" } : std::string{ "/Second" } ), createShader( first ) }
 					, createInfo{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, shader ) } }
-					, cpConfig{ crg::defaultV< uint32_t const * >
+					, cpConfig{ crg::getDefaultV< InitialiseCallback >()
 						, &clusters.needsLightsUpdate()
 						, crg::getDefaultV< IsEnabledCallback >()
+						, crg::getDefaultV< GetPassIndexCallback >()
 						, crg::getDefaultV< RecordCallback >()
 						, crg::getDefaultV< RecordCallback >()
 						, 1u
diff --git a/source/Core/Castor3D/Render/Clustered/SortLightsMortonCode.cpp b/source/Core/Castor3D/Render/Clustered/SortLightsMortonCode.cpp
new file mode 100644
index 0000000000..470c8e21d0
--- /dev/null
+++ b/source/Core/Castor3D/Render/Clustered/SortLightsMortonCode.cpp
@@ -0,0 +1,970 @@
+#include "Castor3D/Render/Clustered/SortLightsMortonCode.hpp"
+
+#include "Castor3D/Engine.hpp"
+#include "Castor3D/Cache/LightCache.hpp"
+#include "Castor3D/Render/RenderDevice.hpp"
+#include "Castor3D/Render/RenderSystem.hpp"
+#include "Castor3D/Render/Clustered/FrustumClusters.hpp"
+#include "Castor3D/Scene/Camera.hpp"
+#include "Castor3D/Scene/Scene.hpp"
+#include "Castor3D/Scene/Light/PointLight.hpp"
+#include "Castor3D/Scene/Light/SpotLight.hpp"
+#include "Castor3D/Shader/Program.hpp"
+#include "Castor3D/Shader/Shaders/GlslAABB.hpp"
+#include "Castor3D/Shader/Shaders/GlslAppendBuffer.hpp"
+#include "Castor3D/Shader/Shaders/GlslClusteredLights.hpp"
+#include "Castor3D/Shader/Shaders/GlslLight.hpp"
+#include "Castor3D/Shader/Shaders/GlslUtils.hpp"
+#include "Castor3D/Shader/Ubos/CameraUbo.hpp"
+#include "Castor3D/Shader/Ubos/ClustersUbo.hpp"
+
+#include <CastorUtils/Design/DataHolder.hpp>
+
+#include <ShaderWriter/Source.hpp>
+
+#include <RenderGraph/FramePassGroup.hpp>
+#include <RenderGraph/RunnablePasses/ComputePass.hpp>
+
+namespace castor3d
+{
+	//*********************************************************************************************
+
+	namespace srtmrt
+	{
+		static uint32_t constexpr NumThreads = 256u;
+		static uint32_t constexpr NumThreadsPerThreadGroup = 256u;
+		static uint32_t constexpr NumValuesPerThread = 8u;
+		static uint32_t constexpr NumValuesPerThreadGroup = NumThreads * NumValuesPerThread;
+
+		struct DispatchData // Push constants payload, mirrored by the shaders' C3D_DispatchData block.
+		{
+			u32 numElements; // Number of key/value pairs to process.
+			u32 chunkSize; // Pushed as 0 by the radix pass, whose shader does not read it.
+		};
+
+		using InMortonCodesCallback = ashes::BufferBase & ( FrustumClusters::* )()const noexcept; // The four aliases share one type: a const noexcept FrustumClusters getter returning a buffer.
+		using OutMortonCodesCallback = ashes::BufferBase & ( FrustumClusters::* )()const noexcept;
+		using InIndicesCallback = ashes::BufferBase & ( FrustumClusters::* )()const noexcept;
+		using OutIndicesCallback = ashes::BufferBase & ( FrustumClusters::* )()const noexcept;
+
+		struct LightData // Light type, name and the FrustumClusters buffer getters to use for that type.
+		{
+			LightType lightType; // Used by the passes to query the light count from the light cache.
+			std::string name;
+			InMortonCodesCallback m_inMortonCodes; // Getter invoked by inMortonCodes().
+			OutMortonCodesCallback m_outMortonCodes; // Getter invoked by outMortonCodes().
+			InIndicesCallback m_inIndices; // Getter invoked by inIndices().
+			OutIndicesCallback m_outIndices; // Getter invoked by outIndices().
+
+			ashes::BufferBase & inMortonCodes( FrustumClusters & clusters ) // Calls m_inMortonCodes on the given clusters.
+			{
+				return ( clusters.*m_inMortonCodes )();
+			}
+
+			ashes::BufferBase & outMortonCodes( FrustumClusters & clusters ) // Calls m_outMortonCodes on the given clusters.
+			{
+				return ( clusters.*m_outMortonCodes )();
+			}
+
+			ashes::BufferBase & inIndices( FrustumClusters & clusters ) // Calls m_inIndices on the given clusters.
+			{
+				return ( clusters.*m_inIndices )();
+			}
+
+			ashes::BufferBase & outIndices( FrustumClusters & clusters ) // Calls m_outIndices on the given clusters.
+			{
+				return ( clusters.*m_outIndices )();
+			}
+		};
+
+		//*****************************************************************************************
+
+		namespace radix
+		{
+			enum BindingPoints // Binding indices of the storage buffers declared by radix::createShader.
+			{
+				eInputKeys, // c3d_inputKeysBuffer
+				eInputValues, // c3d_inputValuesBuffer
+				eOutputKeys, // c3d_outputKeysBuffer
+				eOutputValues, // c3d_outputValuesBuffer
+			};
+
+			static ShaderPtr createShader() // Builds the compute shader that radix-sorts, per thread group, NumThreads keys with their values.
+			{
+				sdw::ComputeWriter writer;
+
+				auto inputKeysBuffer = writer.declStorageBuffer( "c3d_inputKeysBuffer"
+					, uint32_t( eInputKeys )
+					, 0u );
+				auto c3d_inputKeys = inputKeysBuffer.declMemberArray< sdw::UInt >( "ik" );
+				inputKeysBuffer.end();
+
+				auto inputValuesBuffer = writer.declStorageBuffer( "c3d_inputValuesBuffer"
+					, uint32_t( eInputValues )
+					, 0u );
+				auto c3d_inputValues = inputValuesBuffer.declMemberArray< sdw::UInt >( "iv" );
+				inputValuesBuffer.end();
+
+				auto outputKeysBuffer = writer.declStorageBuffer( "c3d_outputKeysBuffer"
+					, uint32_t( eOutputKeys )
+					, 0u );
+				auto c3d_outputKeys = outputKeysBuffer.declMemberArray< sdw::UInt >( "ok" );
+				outputKeysBuffer.end();
+
+				auto outputValuesBuffer = writer.declStorageBuffer( "c3d_outputValuesBuffer"
+					, uint32_t( eOutputValues )
+					, 0u );
+				auto c3d_outputValues = outputValuesBuffer.declMemberArray< sdw::UInt >( "ov" );
+				outputValuesBuffer.end();
+
+				sdw::PushConstantBuffer pcb{ writer, "C3D_DispatchData", "c3d_dispatchData" };
+				auto c3d_numElements = pcb.declMember< sdw::UInt >( "c3d_numElements" );
+				auto c3d_chunkSize = pcb.declMember< sdw::UInt >( "c3d_chunkSize" ); // Declared to match DispatchData; not read by this shader.
+				pcb.end();
+
+				auto gsKeys = writer.declSharedVariable< sdw::UInt >( "gsKeys", NumThreads );		// A temporary buffer to store the input keys.                                          (1,024 Bytes)
+				auto gsValues = writer.declSharedVariable< sdw::UInt >( "gsValues", NumThreads );	// A temporary buffer to store the input values.                                        (1,024 Bytes)
+				auto gsE = writer.declSharedVariable< sdw::UInt >( "gsE", NumThreads );				// Set a 1 for all false sort keys (b == 0) and a 0 for all true sort keys (b == 1)     (1,024 Bytes)
+				auto gsF = writer.declSharedVariable< sdw::UInt >( "gsF", NumThreads );				// Scan the splits. This results in the output index of all false sort keys (b == 0)    (1,024 Bytes)
+				auto gsD = writer.declSharedVariable< sdw::UInt >( "gsD", NumThreads );				// The destination index for the output key and value.                                  (1,024 Bytes)
+				auto gsTotalFalses = writer.declSharedVariable< sdw::UInt >( "gsTotalFalses" );				// The result of gsE[NumThreads - 1] + gsF[NumThreads - 1].                             (4 Bytes)
+
+				writer.implementMainT< sdw::VoidT >( NumThreads
+					, [&]( sdw::ComputeIn in )
+					{
+						// The number of bits to consider sorting.
+						// In this case, the input keys are 30-bit morton codes.
+						const u32 NumBits = 30u;
+
+						auto groupIndex = in.localInvocationIndex;
+						auto threadIndex = in.globalInvocationID.x();
+
+						// Store the input key and values into shared memory (UINT_MAX padding for threads at or beyond c3d_numElements).
+						gsKeys[groupIndex] = writer.ternary( threadIndex < c3d_numElements, c3d_inputKeys[threadIndex], sdw::UInt{ UINT_MAX } );
+						gsValues[groupIndex] = writer.ternary( threadIndex < c3d_numElements, c3d_inputValues[threadIndex], sdw::UInt{ UINT_MAX } );
+
+						// Loop over the bits starting at the least-significant bit.
+						FOR( writer, sdw::UInt, b, 0_u, b < NumBits, ++b )
+						{
+							// 1. In a temporary buffer in shared memory, we set a 1 for all false
+							//    sort keys (b = 0) and a 0 for all true sort keys.
+							gsE[groupIndex] = writer.ternary( ( ( gsKeys[groupIndex] >> b ) & 1_u ) == 0_u
+								, 1_u
+								, 0_u );
+
+							// Sync group shared memory writes.
+							shader::groupMemoryBarrierWithGroupSync( writer );
+
+							IF( writer, groupIndex == 0_u )
+							{
+								gsF[groupIndex] = 0_u;
+							}
+							ELSE
+							{
+								gsF[groupIndex] = gsE[groupIndex - 1_u];
+							}
+							FI;
+
+							// Sync group shared memory writes.
+							shader::groupMemoryBarrierWithGroupSync( writer );
+							auto temp = writer.declLocale( "temp", 0_u );
+
+							// 2. We then scan (prefix sum) this buffer. This is the enumerate operation;
+							//    each false sort key now contains its destination address in the scan
+							//    output, which we will call f. These first two steps are equivalent to
+							//    a stream compaction operation on all false sort keys. (Plain C++ loop, not the writer's FOR.)
+							for ( u32 i = 1; i < NumThreads; i <<= 1u )
+							{
+								temp = gsF[groupIndex];
+
+								IF( writer, groupIndex > i )
+								{
+									temp += gsF[groupIndex - i];
+								}
+								FI;
+
+								// Sync group shared memory reads before writes.
+								shader::groupMemoryBarrierWithGroupSync( writer );
+
+								gsF[groupIndex] = temp;
+
+								// Sync group shared memory writes.
+								shader::groupMemoryBarrierWithGroupSync( writer );
+							}
+
+							// 3. The last element in the scan's output now contains the total
+							//    number of false sort keys. We write this value to a shared
+							//    variable, gsTotalFalses.
+							IF ( writer, groupIndex == 0_u )
+							{
+								gsTotalFalses = gsE[NumThreads - 1u] + gsF[NumThreads - 1u];
+							}
+							FI;
+
+							// Sync group shared memory writes.
+							shader::groupMemoryBarrierWithGroupSync( writer );
+
+							// 4. Now we compute the destination address for the true sort keys. For
+							// a sort key at index i, this address is t = i - f + totalFalses. We
+							// then select between t and f depending on the value of b to get the
+							// destination address d of each fragment.
+							gsD[groupIndex] = writer.ternary( gsE[groupIndex] == 1u
+								, gsF[groupIndex]
+								, groupIndex - gsF[groupIndex] + gsTotalFalses );
+
+							// 5. Finally, we scatter the original sort keys to destination address
+							//    d. The scatter pattern is a perfect permutation of the input, so
+							//    we see no write conflicts with this scatter.
+							auto key = writer.declLocale( "key", gsKeys[groupIndex] );
+							auto value = writer.declLocale( "value", gsValues[groupIndex] );
+
+							// Sync group shared memory reads before writes.
+							shader::groupMemoryBarrierWithGroupSync( writer );
+
+							gsKeys[gsD[groupIndex]] = key;
+							gsValues[gsD[groupIndex]] = value;
+
+							// Sync group shared memory writes.
+							shader::groupMemoryBarrierWithGroupSync( writer );
+						}
+						ROF;
+
+						// Now commit the results to global memory (every thread writes its slot, with no c3d_numElements guard).
+						c3d_outputKeys[threadIndex] = gsKeys[groupIndex];
+						c3d_outputValues[threadIndex] = gsValues[groupIndex];
+					} );
+				return std::make_unique< ast::Shader >( std::move( writer.getShader() ) );
+			}
+
+			class FramePass // Runnable pass that dispatches the radix sort shader for one light type.
+				: public crg::RunnablePass
+			{
+			public:
+				FramePass( crg::FramePass const & framePass
+					, crg::GraphContext & context
+					, crg::RunnableGraph & graph
+					, RenderDevice const & device
+					, FrustumClusters & clusters
+					, LightData lightData )
+					: crg::RunnablePass{ framePass
+						, context
+						, graph
+						, { [this]( uint32_t index ){ doInitialise( index ); }
+							, GetPipelineStateCallback( [](){ return crg::getPipelineState( VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); } )
+							, [this]( crg::RecordContext & recContext, VkCommandBuffer cb, uint32_t i ){ doRecordInto( recContext, cb, i ); }
+							, GetPassIndexCallback( [](){ return 0u; } )
+							, IsEnabledCallback( [this](){ return doIsEnabled(); } )
+							, IsComputePassCallback( [](){ return true; } ) }
+						, crg::ru::Config{ 1u, true /* resettable */ } }
+					, m_clusters{ clusters }
+					, m_lightCache{ clusters.getCamera().getScene()->getLightCache() }
+					, m_lightData{ std::move( lightData ) }
+					, m_pipeline{ framePass, context, graph, device, this }
+				{
+				}
+
+				CRG_API void resetPipeline( crg::VkPipelineShaderStageCreateInfoArray config
+					, uint32_t index ) // NOTE(review): 'config' is unused; the pipeline is reset from m_pipeline.createInfo.
+				{
+					resetCommandBuffer( index );
+					m_pipeline.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_pipeline.createInfo ), index );
+					doCreatePipeline( index, m_pipeline );
+					reRecordCurrent();
+				}
+
+			private:
+				struct Pipeline // Groups the shader module, its stage create info, the compute config and the pipeline holder.
+				{
+					ShaderModule shader;
+					ashes::PipelineShaderStageCreateInfoArray createInfo;
+					crg::cp::ConfigData cpConfig;
+					crg::PipelineHolder pipeline;
+
+					Pipeline( crg::FramePass const & framePass
+						, crg::GraphContext & context
+						, crg::RunnableGraph & graph
+						, RenderDevice const & device
+						, FramePass * parent )
+						: shader{ VK_SHADER_STAGE_COMPUTE_BIT, "RadixSort", createShader() }
+						, createInfo{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, shader ) } }
+						, cpConfig{ crg::getDefaultV< InitialiseCallback >()
+							, nullptr
+							, IsEnabledCallback( [parent]() { return parent->doIsEnabled(); } )
+							, GetPassIndexCallback( []() { return 0u; } )
+							, crg::getDefaultV< RecordCallback >()
+							, crg::getDefaultV< RecordCallback >()
+							, 1u
+							, 1u
+							, 1u }
+						, pipeline{ framePass
+							, context
+							, graph
+							, crg::pp::Config{}
+								.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( createInfo ) )
+								.pushConstants( VkPushConstantRange{ VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u } ) // 8 bytes: the two u32 of DispatchData.
+							, VK_PIPELINE_BIND_POINT_COMPUTE
+							, 1u }
+					{
+					}
+				};
+
+			private:
+				FrustumClusters & m_clusters;
+				LightCache const & m_lightCache;
+				LightData m_lightData;
+				Pipeline m_pipeline;
+
+			private:
+				void doInitialise( uint32_t index )
+				{
+					m_pipeline.pipeline.initialise();
+					doCreatePipeline( index, m_pipeline );
+				}
+
+				bool doIsEnabled()const // True when lights need an update and this light type has at least one light.
+				{
+					return m_clusters.needsLightsUpdate()
+						&& m_lightCache.getLightsCount( m_lightData.lightType ) > 0;
+				}
+
+				void doRecordInto( crg::RecordContext & context
+					, VkCommandBuffer commandBuffer
+					, uint32_t index )
+				{
+					// Dispatch one thread group per NumThreadsPerThreadGroup lights of this pass' light type.
+					auto lightsCount = m_lightCache.getLightsCount( m_lightData.lightType );
+					auto numThreadGroups = uint32_t( std::ceil( float( lightsCount ) / float( NumThreadsPerThreadGroup ) ) );
+					DispatchData data{ lightsCount, 0u }; // chunkSize is pushed as 0 for this pass.
+					m_pipeline.pipeline.recordInto( context, commandBuffer, index );
+					m_context.vkCmdPushConstants( commandBuffer, m_pipeline.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u, &data );
+					m_context.vkCmdDispatch( commandBuffer, numThreadGroups, 1u, 1u );
+				}
+
+				void doCreatePipeline( uint32_t index
+					, Pipeline & pipeline ) // Creates the compute pipeline for 'index' from the holder's first shader stage.
+				{
+					auto & program = pipeline.pipeline.getProgram( index );
+					VkComputePipelineCreateInfo createInfo{ VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO
+						, nullptr
+						, 0u
+						, program.front()
+						, pipeline.pipeline.getPipelineLayout()
+						, VkPipeline{}
+						, 0u };
+					pipeline.pipeline.createPipeline( index, createInfo );
+				}
+			};
+		}
+
+		namespace merge
+		{
+			enum BindingPoints // Binding indices of the storage buffers declared by merge::createShader.
+			{
+				eInputKeys, // c3d_inputKeysBuffer
+				eInputValues, // c3d_inputValuesBuffer
+				eInputMergePathPartitions, // c3d_inputMergePathPartitionsBuffer
+				eOutputKeys, // c3d_outputKeysBuffer
+				eOutputValues, // c3d_outputValuesBuffer
+				eOutputMergePathPartitions, // c3d_outputMergePathPartitionsBuffer
+			};
+
+			static ShaderPtr createShader( bool mergePathPartitions )
+			{
+				sdw::ComputeWriter writer;
+
+				auto inputKeysBuffer = writer.declStorageBuffer( "c3d_inputKeysBuffer"
+					, uint32_t( eInputKeys )
+					, 0u );
+				auto c3d_inputKeys = inputKeysBuffer.declMemberArray< sdw::UInt >( "ik" );
+				inputKeysBuffer.end();
+
+				auto inputValuesBuffer = writer.declStorageBuffer( "c3d_inputValuesBuffer"
+					, uint32_t( eInputValues )
+					, 0u );
+				auto c3d_inputValues = inputValuesBuffer.declMemberArray< sdw::UInt >( "iv" );
+				inputValuesBuffer.end();
+
+				auto inputMergePathPartitionsBuffer = writer.declStorageBuffer( "c3d_inputMergePathPartitionsBuffer"
+					, uint32_t( eInputMergePathPartitions )
+					, 0u );
+				auto c3d_inputMergePathPartitions = inputMergePathPartitionsBuffer.declMemberArray< sdw::Int >( "ip" );
+				inputMergePathPartitionsBuffer.end();
+
+				auto outputKeysBuffer = writer.declStorageBuffer( "c3d_outputKeysBuffer"
+					, uint32_t( eOutputKeys )
+					, 0u );
+				auto c3d_outputKeys = outputKeysBuffer.declMemberArray< sdw::UInt >( "ok" );
+				outputKeysBuffer.end();
+
+				auto outputValuesBuffer = writer.declStorageBuffer( "c3d_outputValuesBuffer"
+					, uint32_t( eOutputValues )
+					, 0u );
+				auto c3d_outputValues = outputValuesBuffer.declMemberArray< sdw::UInt >( "ov" );
+				outputValuesBuffer.end();
+
+				auto outputMergePathPartitionsBuffer = writer.declStorageBuffer( "c3d_outputMergePathPartitionsBuffer"
+					, uint32_t( eOutputMergePathPartitions )
+					, 0u );
+				auto c3d_outputMergePathPartitions = outputMergePathPartitionsBuffer.declMemberArray< sdw::Int >( "op" );
+				outputMergePathPartitionsBuffer.end();
+
+				sdw::PushConstantBuffer pcb{ writer, "C3D_DispatchData", "c3d_dispatchData" };
+				auto c3d_numElements = pcb.declMember< sdw::UInt >( "c3d_numElements" );
+				auto c3d_chunkSize = pcb.declMember< sdw::UInt >( "c3d_chunkSize" );
+				pcb.end();
+
+				auto gsKeys = writer.declSharedVariable< sdw::UInt >( "gsKeys", NumValuesPerThreadGroup );		// Intermediate keys.		(8,192 Bytes)
+				auto gsValues = writer.declSharedVariable< sdw::UInt >( "gsValues", NumValuesPerThreadGroup );	// Intermediate values.		(8,192 Bytes)
+
+				/**
+				 * MergePath is a binary search over two sorted arrays that finds the
+				 * point in list A and list B to begin a merge operation.
+				 * Based on: https://moderngpu.github.io/bulkinsert.html#mergepath
+				 * Retrieved on: Aug 9, 2016.
+				 *
+				 * @param a0 The first element in list A.
+				 * @param aCount The number of elements in A.
+				 * @param b0 The first element in list B.
+				 * @param bCount The number of elements in B.
+				 * @param diag The cross diagonal of the merge matrix where the merge path is computed.
+				 * @param bUseSharedMem Whether to read from shared memory or global memory.
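+				 * @return The number of elements to take from list A to reach the given diagonal (the remaining diag minus this value come from list B).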
+				 */
+				auto mergePath = writer.implementFunction< sdw::Int >( "c3d_mergePath"
+					, [&]( sdw::Int a0, sdw::Int aCount
+						, sdw::Int b0, sdw::Int bCount
+						, sdw::Int diag, sdw::Boolean bUseSharedMem )
+					{
+						auto begin = writer.declLocale( "begin", max( 0_i, diag - bCount ) );
+						auto end = writer.declLocale( "end", min( diag, aCount ) );
+
+						WHILE( writer, begin < end )
+						{
+							// Find the mid-point to start searching from.
+							auto mid = writer.declLocale( "mid", ( begin + end ) >> 1_i );
+							auto a = writer.declLocale( "a", writer.ternary( bUseSharedMem, gsKeys[a0 + mid], c3d_inputKeys[a0 + mid] ) );
+							auto b = writer.declLocale( "b", writer.ternary( bUseSharedMem, gsKeys[b0 + diag - 1 - mid], c3d_inputKeys[b0 + diag - 1 - mid] ) );
+
+							IF( writer, a < b )
+							{
+								begin = mid + 1_i;
+							}
+							ELSE
+							{
+								end = mid;
+							}
+							FI;
+						}
+						ELIHW;
+
+						writer.returnStmt( begin );
+					}
+					, sdw::InInt{ writer, "a0" }
+					, sdw::InInt{ writer, "aCount" }
+					, sdw::InInt{ writer, "b0" }
+					, sdw::InInt{ writer, "bCount" }
+					, sdw::InInt{ writer, "diag" }
+					, sdw::InBoolean{ writer, "bUseSharedMem" } );
+				/**
+				 * Perform a serial merge using shared memory. Write results to global memory.
+				 */
+				auto serialMerge = writer.implementFunction< sdw::Void >( "c3d_serialMerge"
+					, [&]( sdw::Int a0, sdw::Int a1
+						, sdw::Int b0, sdw::Int b1
+						, sdw::Int diag
+						, sdw::Int numValues, sdw::Int out0 )
+					{
+						auto aKey = writer.declLocale( "aKey", gsKeys[a0] );
+						auto bKey = writer.declLocale( "bKey", gsKeys[b0] );
+
+						auto aValue = writer.declLocale( "aValue", gsValues[a0] );
+						auto bValue = writer.declLocale( "bValue", gsValues[b0] );
+
+						FOR( writer, sdw::Int, i, 0_i, i < int( NumValuesPerThread ) && diag + i < numValues, ++i )
+						{
+							IF( writer, b0 >= b1 || ( a0 < a1 && aKey < bKey ) )
+							{
+								c3d_outputKeys[out0 + diag + i] = aKey;
+								c3d_outputValues[out0 + diag + i] = aValue;
+
+								++a0;
+
+								aKey = gsKeys[a0];
+								aValue = gsValues[a0];
+							}
+							ELSE
+							{
+								c3d_outputKeys[out0 + diag + i] = bKey;
+								c3d_outputValues[out0 + diag + i] = bValue;
+
+								++b0;
+
+								bKey = gsKeys[b0];
+								bValue = gsValues[b0];
+							}
+							FI;
+						}
+						ROF;
+					}
+					, sdw::InInt{ writer, "a0" }
+					, sdw::InInt{ writer, "a1" }
+					, sdw::InInt{ writer, "b0" }
+					, sdw::InInt{ writer, "b1" }
+					, sdw::InInt{ writer, "diag" }
+					, sdw::InInt{ writer, "numValues" }
+					, sdw::InInt{ writer, "out0" } );
+
+				writer.implementMainT< sdw::VoidT >( NumThreads
+					, [&]( sdw::ComputeIn in )
+					{
+						auto threadIndex = in.globalInvocationID.x();
+						auto chunkSize = c3d_chunkSize;
+						// Number of chunks to sort.
+						auto numChunks = writer.declLocale( "numChunks", writer.cast< sdw::UInt >( ceil( writer.cast< sdw::Float >( c3d_numElements ) / writer.cast< sdw::Float >( chunkSize ) ) ) );
+						// Num values to sort per sort group.
+						auto numValuesPerSortGroup = writer.declLocale( "numValuesPerSortGroup", min( chunkSize * 2_u, c3d_numElements ) );
+
+						if ( mergePathPartitions )
+						{
+							// Number of sort groups needed to sort all chunks.
+							auto numSortGroups = writer.declLocale( "numSortGroups", numChunks / 2_u );
+
+							// Total number of partitions per sort group.
+							auto numPartitionsPerSortGroup = writer.declLocale( "numPartitionsPerSortGroup", writer.cast< sdw::UInt >( ceil( writer.cast< sdw::Float >( numValuesPerSortGroup ) / float( NumValuesPerThreadGroup ) ) ) + 1_u );
+							// The sort group this thread is operating on.
+							auto sortGroup = writer.declLocale( "sortGroup", threadIndex / numPartitionsPerSortGroup );
+							// The partition this thread is computing within the sort group.
+							auto partitionInSortGroup = writer.declLocale( "partitionInSortGroup", threadIndex % numPartitionsPerSortGroup );
+
+							// The partition across all sort groups.
+							auto globalPartition = writer.declLocale( "globalPartition", ( sortGroup * numPartitionsPerSortGroup ) + partitionInSortGroup );
+							// Compute the maximum number of partitions to compute.
+							auto maxPartitions = writer.declLocale( "maxPartitions", numSortGroups * numPartitionsPerSortGroup );
+
+							IF( writer, globalPartition < maxPartitions )
+							{
+								auto a0 = writer.declLocale( "a0", writer.cast< sdw::Int >( sortGroup * numValuesPerSortGroup ) );
+								auto a1 = writer.declLocale( "a1", min( a0 + writer.cast< sdw::Int >( chunkSize ), writer.cast< sdw::Int >( c3d_numElements ) ) );
+								auto aCount = writer.declLocale( "aCount", a1 - a0 );
+								auto b0 = writer.declLocale( "b0", a1 );
+								auto b1 = writer.declLocale( "b1", min( b0 + writer.cast< sdw::Int >( chunkSize ), writer.cast< sdw::Int >( c3d_numElements ) ) );
+								auto bCount = writer.declLocale( "bCount", b1 - b0 );
+								// Number of values to sort in this sort group.
+								auto numValues = writer.declLocale( "numValues", aCount + bCount );
+								// The diagonal in the merge matrix of this sort group.
+								auto diag = writer.declLocale( "diag", min( writer.cast< sdw::Int >( partitionInSortGroup * NumValuesPerThreadGroup ), numValues ) );
+
+								// Find the merge path for this partition using global memory.
+								auto mergPath = writer.declLocale( "mergePath", mergePath( a0, aCount, b0, bCount, diag, 0_b ) );
+
+								// Write the merge path to global memory.
+								c3d_outputMergePathPartitions[globalPartition] = mergPath;
+							}
+							FI;
+						}
+						else
+						{
+							auto groupID = in.workGroupID.x();
+							auto groupIndex = in.localInvocationIndex;
+
+							// Number of sort groups needed to sort all chunks.
+							auto numSortGroups = writer.declLocale( "numSortGroups", max( numChunks / 2_u, 1_u ) );
+
+							// Compute the number of thread groups required to sort a single sort group.
+							auto numThreadGroupsPerSortGroup = writer.declLocale( "numThreadGroupsPerSortGroup", writer.cast< sdw::UInt >( ceil( writer.cast< sdw::Float >( numValuesPerSortGroup ) / float( NumValuesPerThreadGroup ) ) ) );
+							// The number of partitions per sort group.
+							// We add 1 to account for the merge path partition at the end of the sort group.
+							auto numPartitionsPerSortGroup = writer.declLocale( "numPartitionsPerSortGroup", numThreadGroupsPerSortGroup + 1_u );
+
+							// Compute the sort group that this thread is operating on.
+							auto sortGroup = writer.declLocale( "sortGroup", groupID / numThreadGroupsPerSortGroup );
+							// The merge path partition within the sort group.
+							auto partition = writer.declLocale( "partition", groupID % numThreadGroupsPerSortGroup );
+
+							auto globalPartition = writer.declLocale( "globalPartition", ( sortGroup * numPartitionsPerSortGroup ) + partition );
+
+							// Load the keys into shared memory based on the mergepath for this thread group.
+							auto mergePath0 = writer.declLocale( "mergePath0", c3d_inputMergePathPartitions[globalPartition] );
+							auto mergePath1 = writer.declLocale( "mergePath1", c3d_inputMergePathPartitions[globalPartition + 1_u] );
+							auto diag0 = writer.declLocale( "diag0", writer.cast< sdw::Int >( min( partition * NumValuesPerThreadGroup, numValuesPerSortGroup ) ) );
+							auto diag1 = writer.declLocale( "diag1", writer.cast< sdw::Int >( min( ( partition + 1_u ) * NumValuesPerThreadGroup, numValuesPerSortGroup ) ) );
+
+							// Compute the chunk ranges in the input set.
+							auto chunkOffsetA0 = writer.declLocale( "chunkOffsetA0", writer.cast< sdw::Int >( min( sortGroup * numValuesPerSortGroup, c3d_numElements ) ) );
+							auto chunkOffsetA1 = writer.declLocale( "chunkOffsetA1", min( chunkOffsetA0 + writer.cast< sdw::Int >( chunkSize ), writer.cast< sdw::Int >( c3d_numElements ) ) );
+							auto chunkSizeA = writer.declLocale( "chunkSizeA", chunkOffsetA1 - chunkOffsetA0 );
+
+							auto chunkOffsetB0 = writer.declLocale( "chunkOffsetB0", chunkOffsetA1 );
+							auto chunkOffsetB1 = writer.declLocale( "chunkOffsetB1", min( chunkOffsetB0 + writer.cast< sdw::Int >( chunkSize ), writer.cast< sdw::Int >( c3d_numElements ) ) );
+							auto chunkSizeB = writer.declLocale( "chunkSizeB", chunkOffsetB1 - chunkOffsetB0 );
+
+							// The total number of values to be sorted.
+							auto numValues = writer.declLocale( "numValues", chunkSizeA + chunkSizeB );
+
+							auto a0 = writer.declLocale( "a0", mergePath0 );
+							auto a1 = writer.declLocale( "a1", mergePath1 );
+							auto numA = writer.declLocale( "numA", min( a1 - a0, chunkSizeA ) );
+
+							auto b0 = writer.declLocale( "b0", diag0 - mergePath0 );
+							auto b1 = writer.declLocale( "b1", diag1 - mergePath1 );
+							auto numB = writer.declLocale( "numB", min( b1 - b0, chunkSizeB ) );
+
+							// Compute the diagonal for this thread within the threadgroup.
+							auto diag = writer.declLocale( "diag", writer.cast< sdw::Int >( groupIndex * NumValuesPerThread ) );
+
+							auto a = writer.declLocale( "a", 0_i );
+							auto b = writer.declLocale( "b", 0_i );
+							auto key = writer.declLocale( "key", 0_u );
+							auto value = writer.declLocale( "value", 0_u );
+
+							// Load the keys and values into shared memory.
+							for ( s32 i = 0; i < s32( NumValuesPerThread ); ++i )
+							{
+								a = a0 + diag + i;
+								b = b0 + ( a - a1 );
+
+								IF ( writer, a < a1 )
+								{
+									key = c3d_inputKeys[chunkOffsetA0 + a];
+									value = c3d_inputValues[chunkOffsetA0 + a];
+								}
+								ELSE
+								{
+									key = c3d_inputKeys[chunkOffsetB0 + b];
+									value = c3d_inputValues[chunkOffsetB0 + b];
+								}
+								FI;
+
+								gsKeys[diag + i] = key;
+								gsValues[diag + i] = value;
+							}
+
+							// Sync loading of keys/values in shared memory.
+							shader::groupMemoryBarrierWithGroupSync( writer );
+
+							// Compute the mergepath for this thread using shared memory.
+							auto mergPath = writer.declLocale( "mergePath", mergePath( 0_i, numA, numA, numB, diag, 1_b ) );
+
+							// Perform the serial merge using shared memory.
+							serialMerge( mergPath, numA, numA + diag - mergPath, numA + numB, diag0 + diag, numValues, chunkOffsetA0 );
+						}
+					} );
+				return std::make_unique< ast::Shader >( std::move( writer.getShader() ) );
+			}
+
+			class FramePass // Runnable pass recording the merge path partitions and merge sort compute dispatches.
+				: public crg::RunnablePass
+			{
+			public:
+				FramePass( crg::FramePass const & framePass
+					, crg::GraphContext & context
+					, crg::RunnableGraph & graph
+					, RenderDevice const & device
+					, FrustumClusters & clusters
+					, LightData lightData )
+					: crg::RunnablePass{ framePass
+						, context
+						, graph
+						, { [this]( uint32_t index ){ doInitialise( index ); }
+							, GetPipelineStateCallback( [](){ return crg::getPipelineState( VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); } )
+							, [this]( crg::RecordContext & recContext, VkCommandBuffer cb, uint32_t i ){ doRecordInto( recContext, cb, i ); }
+							, crg::getDefaultV< GetPassIndexCallback >()
+							, IsEnabledCallback( [this](){ return doIsEnabled(); } )
+							, IsComputePassCallback( [](){ return true; } ) }
+						, crg::ru::Config{ 1u, true /* resettable */ } }
+					, m_clusters{ clusters }
+					, m_lightCache{ clusters.getCamera().getScene()->getLightCache() }
+					, m_lightData{ std::move( lightData ) }
+					, m_partitions{ framePass, context, graph, device, true, this }
+					, m_merge{ framePass, context, graph, device, false, this }
+				{
+				}
+				// Resets the command buffer, resets and recreates both compute pipelines for the given pass index, then re-records the pass.
+				CRG_API void resetPipeline( crg::VkPipelineShaderStageCreateInfoArray config
+					, uint32_t index )
+				{ // NOTE(review): config is not read, both pipelines are reset from their own createInfo — confirm this is intended.
+					resetCommandBuffer( index );
+					m_partitions.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_partitions.createInfo ), index );
+					m_merge.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_merge.createInfo ), index );
+					doCreatePipeline( index, m_partitions );
+					doCreatePipeline( index, m_merge );
+					reRecordCurrent();
+				}
+
+			private:
+				struct Pipeline // Groups a compute shader with the holder of the pipelines built from it.
+				{
+					ShaderModule shader;
+					ashes::PipelineShaderStageCreateInfoArray createInfo;
+					crg::cp::ConfigData cpConfig;
+					crg::PipelineHolder pipeline;
+
+					Pipeline( crg::FramePass const & framePass
+						, crg::GraphContext & context
+						, crg::RunnableGraph & graph
+						, RenderDevice const & device
+						, bool mergePathPartitions // true: "MergePathPartitions" shader, false: "MergeSort" shader.
+						, FramePass * parent )
+						: shader{ VK_SHADER_STAGE_COMPUTE_BIT, mergePathPartitions ? std::string{ "MergePathPartitions" } : std::string{ "MergeSort" }, createShader( mergePathPartitions ) }
+						, createInfo{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, shader ) } }
+						, cpConfig{ crg::getDefaultV< InitialiseCallback >()
+							, nullptr
+							, IsEnabledCallback( [parent](){ return parent->doIsEnabled();} )
+							, crg::getDefaultV< GetPassIndexCallback >()
+							, crg::getDefaultV< RecordCallback >()
+							, crg::getDefaultV< RecordCallback >()
+							, 1u
+							, 1u
+							, 1u }
+						, pipeline{ framePass
+							, context
+							, graph
+							, crg::pp::Config{}
+								.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( createInfo ) )
+								.pushConstants( VkPushConstantRange{ VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u } ) // 8 bytes: the size pushed in doRecordInto.
+							, VK_PIPELINE_BIND_POINT_COMPUTE
+							, 2u }
+					{
+					}
+				};
+
+			private:
+				FrustumClusters & m_clusters;
+				LightCache const & m_lightCache;
+				LightData m_lightData;
+				Pipeline m_partitions;
+				Pipeline m_merge;
+
+			private:
+				void doInitialise( uint32_t index ) // Initialises both pipeline holders, then creates their compute pipelines.
+				{
+					m_partitions.pipeline.initialise();
+					m_merge.pipeline.initialise();
+					doCreatePipeline( index, m_partitions );
+					doCreatePipeline( index, m_merge );
+				}
+				// The pass is enabled when the clusters report that lights need an update and at least one light of the handled type exists.
+				bool doIsEnabled()const
+				{
+					return m_clusters.needsLightsUpdate()
+						&& m_lightCache.getLightsCount( m_lightData.lightType ) > 0;
+				}
+				// Records the merge iterations: each one merges pairs of chunks, then doubles the chunk size, until a single chunk remains.
+				void doRecordInto( crg::RecordContext & context
+					, VkCommandBuffer commandBuffer
+					, uint32_t index )
+				{
+					auto totalValues = m_lightCache.getLightsCount( m_lightData.lightType );
+					auto chunkSize = NumThreadsPerThreadGroup;
+
+					// The total number of chunks to sort (the count is rounded up, so the last chunk may be partial).
+					auto numChunks = getLightsMortonCodeChunkCount( totalValues );
+					DispatchData data{ totalValues, 0u }; // chunkSize is set at each iteration below.
+
+					while ( numChunks > 1u )
+					{
+						data.chunkSize = chunkSize;
+
+						// Number of sort groups required to sort all chunks.
+						// Each sort group merge sorts 2 chunks into a single chunk.
+						auto numSortGroups = numChunks / 2u;
+
+						// Compute merge path partitions per thread group.
+						{
+							m_partitions.pipeline.recordInto( context, commandBuffer, index );
+
+							// The number of thread groups that are required per sort group.
+							auto numThreadGroupsPerSortGroup = uint32_t( std::ceil( float( chunkSize * 2u ) / float( NumValuesPerThreadGroup ) ) );
+
+							// The number of merge path partitions that need to be computed.
+							auto numMergePathPartitionsPerSortGroup = numThreadGroupsPerSortGroup + 1u;
+							auto totalMergePathPartitions = numMergePathPartitionsPerSortGroup * numSortGroups;
+
+							// The number of thread groups needed to compute all merge path partitions.
+							auto numThreadGroups = uint32_t( std::ceil( float( totalMergePathPartitions ) / float( NumThreadsPerThreadGroup ) ) );
+
+							m_context.vkCmdPushConstants( commandBuffer, m_partitions.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u, &data );
+							m_context.vkCmdDispatch( commandBuffer, numThreadGroups, 1u, 1u );
+
+							// Add an explicit barrier for the merge path partitions.
+							// This is required since the merge path partitions storage buffer is written
+							// by the MergePathPartitions compute shader, then read by the MergeSort
+							// compute shader. Without a barrier between the two dispatches,
+							// MergeSort will likely not see the correct merge path partitions.
+							// To resolve this, an explicit memory barrier is recorded, through doBarriers,
+							// for the clearable storage buffers of this pass.
+							doBarriers( context, commandBuffer, index );
+						}
+
+						// Perform merge sort using merge path partitions computed from the previous step.
+						{
+							m_merge.pipeline.recordInto( context, commandBuffer, index );
+
+							// The number of values that each sort group will sort.
+							// Each sort group merges 2 chunks into 1.
+							auto numValuesPerSortGroup = std::min( chunkSize * 2u, totalValues );
+
+							// The number of thread groups required to sort all values.
+							auto numThreadGroupsPerSortGroup = uint32_t( std::ceil( float( numValuesPerSortGroup ) / float( NumValuesPerThreadGroup ) ) );
+
+							m_context.vkCmdPushConstants( commandBuffer, m_merge.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u, &data );
+							m_context.vkCmdDispatch( commandBuffer, numThreadGroupsPerSortGroup * numSortGroups, 1u, 1u );
+						}
+
+						// Ping-pong the buffers
+						index = 1u - index;
+
+						chunkSize *= 2;
+						numChunks = uint32_t( std::ceil( float( totalValues ) / float( chunkSize ) ) );
+					}
+				}
+				// Records a compute shader read/write memory barrier for each storage buffer attachment that is clearable and allows transitions.
+				void doBarriers( crg::RecordContext & context
+					, VkCommandBuffer commandBuffer
+					, uint32_t passIndex )
+				{
+					for ( auto & attach : m_pass.buffers )
+					{
+						auto buffer = attach.buffer;
+
+						if ( !attach.isNoTransition()
+							&& attach.isStorageBuffer()
+							&& attach.isClearableBuffer() )
+						{
+							auto currentState = context.getAccessState( buffer.buffer.buffer( passIndex )
+								, buffer.range );
+							context.memoryBarrier( commandBuffer
+								, buffer.buffer.buffer( passIndex )
+								, buffer.range
+								, currentState.access
+								, currentState.pipelineStage
+								, crg::AccessState{ VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT }
+								, true );
+						}
+					}
+				}
+				// Creates, for the given pass index, the compute pipeline using the first shader stage of the holder's program.
+				void doCreatePipeline( uint32_t index
+					, Pipeline & pipeline )
+				{
+					auto & program = pipeline.pipeline.getProgram( index );
+					VkComputePipelineCreateInfo createInfo{ VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO
+						, nullptr
+						, 0u
+						, program.front()
+						, pipeline.pipeline.getPipelineLayout()
+						, VkPipeline{}
+						, 0u };
+					pipeline.pipeline.createPipeline( index, createInfo );
+				}
+			};
+		}
+
+		//*********************************************************************************************
+
+		/**
+		 * Creates, for one light type, the radix sort pass followed by the merge sort pass
+		 * ordering its morton codes and light indices.
+		 * @param graph The pass group in which the passes are created.
+		 * @param previousPass The pass the radix sort pass depends on.
+		 * @param clusters The frustum clusters providing the buffers to sort.
+		 * @param lightData The light type name and the accessors to its buffers in the clusters.
+		 * @return The merge sort pass, which ends the chain.
+		 */
+		static crg::FramePass const & createSortLightMortonCodesPasses( crg::FramePassGroup & graph
+			, crg::FramePass const * previousPass
+			, RenderDevice const & device
+			, CameraUbo const & cameraUbo
+			, FrustumClusters & clusters
+			, LightData lightData )
+		{
+			// Common prefixes of the binding names.
+			auto const inName = "C3D_In" + lightData.name;
+			auto const outName = "C3D_Out" + lightData.name;
+
+			// Radix sort: reads the input buffers, writes the output ones.
+			auto & radixPass = graph.createPass( "RadixSort" + lightData.name
+				, [&clusters, &device, lightData]( crg::FramePass const & pass
+					, crg::GraphContext & ctx
+					, crg::RunnableGraph & runnableGraph )
+				{
+					auto runnable = std::make_unique< radix::FramePass >( pass, ctx, runnableGraph, device, clusters, lightData );
+					device.renderSystem.getEngine()->registerTimer( pass.getFullName(), runnable->getTimer() );
+					return runnable;
+				} );
+			radixPass.addDependency( *previousPass );
+			createInputStoragePassBinding( radixPass, uint32_t( radix::eInputKeys ), inName + "LightMortonCodes", lightData.inMortonCodes( clusters ), 0u, ashes::WholeSize );
+			createInputStoragePassBinding( radixPass, uint32_t( radix::eInputValues ), inName + "LightIndices", lightData.inIndices( clusters ), 0u, ashes::WholeSize );
+			createClearableOutputStorageBinding( radixPass, uint32_t( radix::eOutputKeys ), outName + "LightMortonCodes", lightData.outMortonCodes( clusters ), 0u, ashes::WholeSize );
+			createClearableOutputStorageBinding( radixPass, uint32_t( radix::eOutputValues ), outName + "LightIndices", lightData.outIndices( clusters ), 0u, ashes::WholeSize );
+
+			// Merge sort: runs after the radix sort, starting from the buffers it wrote.
+			auto & mergePass = graph.createPass( "MergeSort" + lightData.name
+				, [&clusters, &device, lightData]( crg::FramePass const & pass
+					, crg::GraphContext & ctx
+					, crg::RunnableGraph & runnableGraph )
+				{
+					auto runnable = std::make_unique< merge::FramePass >( pass, ctx, runnableGraph, device, clusters, lightData );
+					device.renderSystem.getEngine()->registerTimer( pass.getFullName(), runnable->getTimer() );
+					return runnable;
+				} );
+			mergePass.addDependency( radixPass );
+			// Keys and values buffers are bound in both orders (one per pass index), to allow ping-ponging between them.
+			createInOutStoragePassBinding( mergePass, uint32_t( merge::eInputKeys ), inName + "MortonCodes", { &lightData.outMortonCodes( clusters ), &lightData.inMortonCodes( clusters ) }, 0u, ashes::WholeSize );
+			createInOutStoragePassBinding( mergePass, uint32_t( merge::eInputValues ), inName + "LightIndices", { &lightData.outIndices( clusters ), &lightData.inIndices( clusters ) }, 0u, ashes::WholeSize );
+			createInputStoragePassBinding( mergePass, uint32_t( merge::eInputMergePathPartitions ), inName + "MergePathPartitions", clusters.getMergePathPartitionsBuffer(), 0u, ashes::WholeSize );
+			createInOutStoragePassBinding( mergePass, uint32_t( merge::eOutputKeys ), outName + "MortonCodes", { &lightData.inMortonCodes( clusters ), &lightData.outMortonCodes( clusters ) }, 0u, ashes::WholeSize );
+			createInOutStoragePassBinding( mergePass, uint32_t( merge::eOutputValues ), outName + "LightIndices", { &lightData.inIndices( clusters ), &lightData.outIndices( clusters ) }, 0u, ashes::WholeSize );
+			createClearableOutputStorageBinding( mergePass, uint32_t( merge::eOutputMergePathPartitions ), outName + "MergePathPartitions", clusters.getMergePathPartitionsBuffer(), 0u, ashes::WholeSize );
+
+			return mergePass;
+		}
+	}
+
+	//*********************************************************************************************
+
+	u32 getLightsMortonCodeChunkCount( u32 lightCount )
+	{
+		// The chunk size is the number of threads per thread group.
+		auto const chunkSize = u32( srtmrt::NumThreadsPerThreadGroup );
+
+		// Number of chunks needed to hold every light, the last one possibly being partial.
+		// Integer ceiling division: exact for any count, unlike a round trip through float.
+		return ( lightCount / chunkSize ) + ( ( lightCount % chunkSize ) != 0u ? 1u : 0u );
+	}
+
+	/**
+	 * Creates the morton codes sorting passes for point lights, then for spot lights.
+	 * Each chain is a radix sort pass followed by a merge sort pass.
+	 * @param graph The pass group receiving the created passes.
+	 * @param previousPass The pass both sorting chains depend on.
+	 * @param device The render device, forwarded to the created passes.
+	 * @param cameraUbo Forwarded to the passes creation helper.
+	 * @param clusters The frustum clusters, providing the morton codes and light indices buffers.
+	 * @return The last pass of the point lights chain, followed by the last pass of the spot lights chain.
+	 */
+	crg::FramePassArray createSortLightsMortonCodePass( crg::FramePassGroup & graph
+		, crg::FramePass const * previousPass
+		, RenderDevice const & device
+		, CameraUbo const & cameraUbo
+		, FrustumClusters & clusters )
+	{
+		// Point lights: radix sort + merge sort over the point lights buffers.
+		auto & pointPass = srtmrt::createSortLightMortonCodesPasses( graph, previousPass, device, cameraUbo, clusters
+			, { LightType::ePoint, "Point"
+				, &FrustumClusters::getInputPointLightMortonCodesBuffer
+				, &FrustumClusters::getOutputPointLightMortonCodesBuffer
+				, &FrustumClusters::getInputPointLightIndicesBuffer
+				, &FrustumClusters::getOutputPointLightIndicesBuffer } );
+
+		// Spot lights: radix sort + merge sort over the spot lights buffers.
+		auto & spotPass = srtmrt::createSortLightMortonCodesPasses( graph, previousPass, device, cameraUbo, clusters
+			, { LightType::eSpot, "Spot"
+				, &FrustumClusters::getInputSpotLightMortonCodesBuffer
+				, &FrustumClusters::getOutputSpotLightMortonCodesBuffer
+				, &FrustumClusters::getInputSpotLightIndicesBuffer
+				, &FrustumClusters::getOutputSpotLightIndicesBuffer } );
+
+		return { &pointPass, &spotPass };
+	}
+
+	//*********************************************************************************************
+}
diff --git a/source/Core/Castor3D/Render/GlobalIllumination/VoxelConeTracing/Voxelizer.cpp b/source/Core/Castor3D/Render/GlobalIllumination/VoxelConeTracing/Voxelizer.cpp
index 3bdce89ee6..fec3661527 100644
--- a/source/Core/Castor3D/Render/GlobalIllumination/VoxelConeTracing/Voxelizer.cpp
+++ b/source/Core/Castor3D/Render/GlobalIllumination/VoxelConeTracing/Voxelizer.cpp
@@ -102,7 +102,7 @@ namespace castor3d
 				for ( auto & attach : m_pass.buffers )
 				{
 					m_context.vkCmdFillBuffer( commandBuffer
-						, attach.buffer.buffer.buffer
+						, attach.buffer.buffer.buffer( index )
 						, attach.buffer.range.offset
 						, attach.buffer.range.size
 						, 0u );
diff --git a/source/Core/Castor3D/Render/Opaque/Lighting/ClusteredLightsPipeline.cpp b/source/Core/Castor3D/Render/Opaque/Lighting/ClusteredLightsPipeline.cpp
index 85ba7e0897..c0e509044c 100644
--- a/source/Core/Castor3D/Render/Opaque/Lighting/ClusteredLightsPipeline.cpp
+++ b/source/Core/Castor3D/Render/Opaque/Lighting/ClusteredLightsPipeline.cpp
@@ -168,7 +168,7 @@ namespace castor3d
 			if ( attach.view.size() == 1u )
 			{
 				auto view = attach.view.front();
-				context.setLayoutState( view
+				context.setLayoutState( resolveView( view, passIndex )
 					, { attach.output
 					, crg::getAccessMask( attach.output )
 					, crg::getStageMask( attach.output ) } );
diff --git a/source/Core/Castor3D/Render/Opaque/Lighting/LightsPipeline.cpp b/source/Core/Castor3D/Render/Opaque/Lighting/LightsPipeline.cpp
index f8f7bd4ed3..7304bd6df3 100644
--- a/source/Core/Castor3D/Render/Opaque/Lighting/LightsPipeline.cpp
+++ b/source/Core/Castor3D/Render/Opaque/Lighting/LightsPipeline.cpp
@@ -445,7 +445,7 @@ namespace castor3d
 			if ( attach.view.size() == 1u )
 			{
 				auto view = attach.view.front();
-				context.setLayoutState( view
+				context.setLayoutState( resolveView( view, passIndex )
 					, { attach.output
 					, crg::getAccessMask( attach.output )
 					, crg::getStageMask( attach.output ) } );
diff --git a/source/Core/Castor3D/Render/Opaque/VisibilityReorderPass.cpp b/source/Core/Castor3D/Render/Opaque/VisibilityReorderPass.cpp
index 3c4330ceae..7376c0fc73 100644
--- a/source/Core/Castor3D/Render/Opaque/VisibilityReorderPass.cpp
+++ b/source/Core/Castor3D/Render/Opaque/VisibilityReorderPass.cpp
@@ -354,7 +354,7 @@ namespace castor3d
 			{
 				auto buffer = attach.buffer.buffer;
 				m_context.vkCmdFillBuffer( commandBuffer
-					, buffer.buffer
+					, buffer.buffer( index )
 					, 0u
 					, ashes::WholeSize
 					, 0u );
diff --git a/source/Core/Castor3D/Render/RenderTarget.cpp b/source/Core/Castor3D/Render/RenderTarget.cpp
index 0eb1c743c4..12b7128741 100644
--- a/source/Core/Castor3D/Render/RenderTarget.cpp
+++ b/source/Core/Castor3D/Render/RenderTarget.cpp
@@ -565,6 +565,7 @@ namespace castor3d
 		doCleanupCombineProgram();
 		m_culler.reset();
 		m_hdrConfigUbo.reset();
+		m_frustumClusters.reset();
 	}
 
 	void RenderTarget::update( CpuUpdater & updater )
diff --git a/source/Core/Castor3D/Scene/Light/SpotLight.cpp b/source/Core/Castor3D/Scene/Light/SpotLight.cpp
index 77455aff25..7f339edc35 100644
--- a/source/Core/Castor3D/Scene/Light/SpotLight.cpp
+++ b/source/Core/Castor3D/Scene/Light/SpotLight.cpp
@@ -26,7 +26,7 @@ namespace castor3d
 			castor::Point3f min{ points[0] };
 			castor::Point3f max{ points[0] };
 
-			for ( auto & cur : castor::makeArrayView( &points[1], points.size() - 1u ) )
+			for ( auto & cur : castor::makeArrayView( &points[1], uint64_t( points.size() - 1u ) ) )
 			{
 				max[0] = std::max( cur[0], max[0] );
 				max[1] = std::max( cur[1], max[1] );
diff --git a/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp b/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp
index a5cd31fffd..41c5d1c60c 100644
--- a/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp
+++ b/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp
@@ -82,7 +82,6 @@ namespace castor
 		Logger::logError( description );
 	}
 
-	[[ noreturn ]]
 	void cuFailure( char const * const description )
 	{
 		std::stringstream stream;
diff --git a/tools/CastorTestLauncher/MainFrame.cpp b/tools/CastorTestLauncher/MainFrame.cpp
index 678ac7353f..7dfe3688bc 100644
--- a/tools/CastorTestLauncher/MainFrame.cpp
+++ b/tools/CastorTestLauncher/MainFrame.cpp
@@ -262,7 +262,7 @@ namespace test_launcher
 		, m_engine{ engine }
 		, m_maxFrameCount{ maxFrameCount }
 	{
-		SetClientSize( FromDIP( wxSize{ 800, 600 } ) );
+		SetClientSize( wxSize{ 800, 600 } );
 	}
 
 	bool MainFrame::initialise()
diff --git a/tools/CastorViewer/MainFrame.cpp b/tools/CastorViewer/MainFrame.cpp
index 37764e21a9..00bc0c9923 100644
--- a/tools/CastorViewer/MainFrame.cpp
+++ b/tools/CastorViewer/MainFrame.cpp
@@ -265,7 +265,11 @@ namespace CastorViewer
 		SetBackgroundColour( GuiCommon::PANEL_BACKGROUND_COLOUR );
 		SetForegroundColour( GuiCommon::PANEL_FOREGROUND_COLOUR );
 
-		auto size = FromDIP( wxSize{ 800 + m_propertiesWidth, 600 + m_logsHeight } );
+#if wxCHECK_VERSION( 3, 1, 0 )
+		auto size = this->FromDIP( wxSize{ 800 + m_propertiesWidth, 600 + m_logsHeight } );
+#else
+		auto size = wxSize{ 800 + m_propertiesWidth, 600 + m_logsHeight };
+#endif
 		SetClientSize( size );
 #if wxCHECK_VERSION( 2, 9, 0 )
 		SetMinClientSize( size );
@@ -690,14 +694,18 @@ namespace CastorViewer
 			return;
 		}
 
-		auto size = FromDIP( GuiCommon::make_wxSize( target->getSize() ) );
+#if wxCHECK_VERSION( 3, 1, 0 )
+		auto size = this->FromDIP( GuiCommon::make_wxSize( target->getSize() ) );
+#else
+		auto size = GuiCommon::make_wxSize( target->getSize() );
+#endif
 
 		if ( IsMaximized() )
 		{
 			Maximize( false );
 		}
 
-		SetPosition( FromDIP( wxPoint{} ) );
+		SetPosition( wxPoint{} );
 		SetClientSize( size );
 #if wxCHECK_VERSION( 2, 9, 0 )
 		SetMinClientSize( size );
diff --git a/vcpkg.json b/vcpkg.json
index 9868ae42ec..30fe633775 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -2,7 +2,7 @@
   "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg/master/scripts/vcpkg.schema.json",
   "name": "castor3d",
   "version": "0.15.0",
-  "builtin-baseline": "6accd15d644e93cec849ea346a147828437928b3",
+  "builtin-baseline": "1c5a340f6e10985e2d92af174a68dbd15c1fa4e1",
   "dependencies": [
     "convectionkernels",
     "freetype",
@@ -15,7 +15,6 @@
     "assimp",
     "freeimage",
     "glsl",
-    "tools",
     "vkfft"
   ],
   "vcpkg-configuration": {