From 55a66d286860ea04c4db86d4f395e04fd84f90a5 Mon Sep 17 00:00:00 2001 From: Sylvain Doremus Date: Sat, 27 May 2023 02:15:15 +0200 Subject: [PATCH] Clustered: Added lights BVH generation. --- external/RenderGraph | 2 +- .../Clustered/AssignLightsToClusters.hpp | 3 +- .../Render/Clustered/BuildLightsBVH.hpp | 18 + .../Render/Clustered/ClusteredModule.hpp | 6 + .../Render/Clustered/ComputeClustersAABB.hpp | 2 +- .../Clustered/ComputeLightsMortonCode.hpp | 18 + .../Render/Clustered/FrustumClusters.hpp | 169 +++++- .../Render/Clustered/ReduceLightsAABB.hpp | 18 + include/Core/Castor3D/Render/RenderTarget.hpp | 2 +- .../Core/Castor3D/Render/RenderTechnique.hpp | 3 +- .../Shader/Shaders/GlslClusteredLights.hpp | 153 +++-- .../Castor3D/Shader/Shaders/SdwModule.hpp | 1 + .../Core/Castor3D/Shader/Ubos/CameraUbo.hpp | 9 +- .../Core/Castor3D/Shader/Ubos/ClustersUbo.hpp | 20 +- .../Core/Castor3D/Shader/Ubos/UbosModule.hpp | 8 + include/Core/CastorUtils/Config/SmartPtr.hpp | 12 + .../Core/CastorUtils/Design/DataHolder.hpp | 2 +- .../Core/CastorUtils/Design/DesignModule.hpp | 2 +- .../Core/CastorUtils/Exception/Assertion.hpp | 8 +- source/Core/Castor3D/CMakeLists.txt | 6 + source/Core/Castor3D/DebugDefines.hpp | 2 + .../Castor3D/Event/Frame/CpuFrameEvent.cpp | 12 +- .../Clustered/AssignLightsToClusters.cpp | 448 +++++++++++---- .../Render/Clustered/BuildLightsBVH.cpp | 527 ++++++++++++++++++ .../Render/Clustered/ComputeClustersAABB.cpp | 7 +- .../Clustered/ComputeLightsMortonCode.cpp | 263 +++++++++ .../Render/Clustered/FrustumClusters.cpp | 213 +++++-- .../Render/Clustered/ReduceLightsAABB.cpp | 408 ++++++++++++++ .../Render/Opaque/DeferredRendering.cpp | 2 + .../Core/Castor3D/Render/RenderNodesPass.cpp | 8 +- .../Core/Castor3D/Render/RenderTechnique.cpp | 23 +- .../Transparent/TransparentRendering.cpp | 2 + source/Core/Castor3D/Shader/ShaderModule.cpp | 8 + .../Shader/Shaders/GlslClusteredLights.cpp | 28 +- .../Core/Castor3D/Shader/Ubos/ClustersUbo.cpp | 9 +- 
.../CastorUtils/CastorUtilsPrerequisites.cpp | 2 +- 36 files changed, 2183 insertions(+), 241 deletions(-) create mode 100644 include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp create mode 100644 include/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp create mode 100644 include/Core/Castor3D/Render/Clustered/ReduceLightsAABB.hpp create mode 100644 source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp create mode 100644 source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp create mode 100644 source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp diff --git a/external/RenderGraph b/external/RenderGraph index c5bdda7933..31396b687f 160000 --- a/external/RenderGraph +++ b/external/RenderGraph @@ -1 +1 @@ -Subproject commit c5bdda793361901762d273b76f5a2355f1ab5411 +Subproject commit 31396b687fc090e9fc9a1840f1dc3593a6ccf31c diff --git a/include/Core/Castor3D/Render/Clustered/AssignLightsToClusters.hpp b/include/Core/Castor3D/Render/Clustered/AssignLightsToClusters.hpp index 0fb497fc7e..3a01df987c 100644 --- a/include/Core/Castor3D/Render/Clustered/AssignLightsToClusters.hpp +++ b/include/Core/Castor3D/Render/Clustered/AssignLightsToClusters.hpp @@ -11,9 +11,8 @@ namespace castor3d C3D_API crg::FramePass const & createAssignLightsToClustersPass( crg::FramePassGroup & graph , crg::FramePass const * previousPass , RenderDevice const & device - , LightCache const & lights , CameraUbo const & cameraUbo - , FrustumClusters const & clusters ); + , FrustumClusters & clusters ); } #endif diff --git a/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp b/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp new file mode 100644 index 0000000000..3d8112b5af --- /dev/null +++ b/include/Core/Castor3D/Render/Clustered/BuildLightsBVH.hpp @@ -0,0 +1,18 @@ +/* +See LICENSE file in root folder +*/ +#ifndef ___C3D_BuildLightsBVH_H___ +#define ___C3D_BuildLightsBVH_H___ + +#include "ClusteredModule.hpp" + +namespace castor3d +{ + C3D_API 
crg::FramePass const & createBuildLightsBVHPass( crg::FramePassGroup & graph + , crg::FramePass const * previousPass + , RenderDevice const & device + , CameraUbo const & cameraUbo + , FrustumClusters & clusters ); +} + +#endif diff --git a/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp b/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp index 9bbd82611d..8d4bda12c1 100644 --- a/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp +++ b/include/Core/Castor3D/Render/Clustered/ClusteredModule.hpp @@ -6,6 +6,8 @@ See LICENSE file in root folder #include "Castor3D/Render/RenderModule.hpp" +#include + namespace castor3d { /**@name Render */ @@ -34,6 +36,10 @@ namespace castor3d CU_DeclareSmartPtr( castor3d, FrustumClusters, C3D_API ); + using ClustersBuffersChangedFunction = std::function< void( FrustumClusters const & ) >; + using OnClustersBuffersChanged = castor::SignalT< ClustersBuffersChangedFunction >; + using OnClustersBuffersChangedConnection = castor::ConnectionT< ClustersBuffersChangedFunction >; + //@} //@} } diff --git a/include/Core/Castor3D/Render/Clustered/ComputeClustersAABB.hpp b/include/Core/Castor3D/Render/Clustered/ComputeClustersAABB.hpp index da9facca08..6953e87025 100644 --- a/include/Core/Castor3D/Render/Clustered/ComputeClustersAABB.hpp +++ b/include/Core/Castor3D/Render/Clustered/ComputeClustersAABB.hpp @@ -12,7 +12,7 @@ namespace castor3d , crg::FramePass const * previousPass , RenderDevice const & device , CameraUbo const & cameraUbo - , FrustumClusters const & clusters ); + , FrustumClusters & clusters ); } #endif diff --git a/include/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp b/include/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp new file mode 100644 index 0000000000..909d808862 --- /dev/null +++ b/include/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp @@ -0,0 +1,18 @@ +/* +See LICENSE file in root folder +*/ +#ifndef ___C3D_ComputeLightsMortonCode_H___ +#define 
___C3D_ComputeLightsMortonCode_H___ + +#include "ClusteredModule.hpp" + +namespace castor3d +{ + C3D_API crg::FramePass const & createComputeLightsMortonCodePass( crg::FramePassGroup & graph + , crg::FramePass const * previousPass + , RenderDevice const & device + , CameraUbo const & cameraUbo + , FrustumClusters & clusters ); +} + +#endif diff --git a/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp b/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp index 8adde56b28..9d6fa718ab 100644 --- a/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp +++ b/include/Core/Castor3D/Render/Clustered/FrustumClusters.hpp @@ -11,6 +11,7 @@ See LICENSE file in root folder #include "Castor3D/Shader/Ubos/ClustersUbo.hpp" #include +#include #include @@ -34,6 +35,42 @@ namespace castor3d *\param[in, out] updater Les données d'update. */ C3D_API void update( CpuUpdater & updater ); + /** + *\~english + *\brief Registers the clusters related frame passes. + *\~french + *\brief Enregistre les passes liées aux clusters. + */ + C3D_API crg::FramePass const & createFramePasses( crg::FramePassGroup & graph + , crg::FramePass const * previousPass + , CameraUbo const & cameraUbo ); + /** + *\~english + *\brief Compute the number of nodes for given BVH level. + *\param[in] level The BVH level. + *\~french + *\brief Calcule le nombre de noeuds du niveau donné d'un BVH. + *\param[in] level Le niveau du BVH. + */ + C3D_API static uint32_t getNumLevelNodes( uint32_t level ); + /** + *\~english + *\brief Compute the number of levels needed for a BVH that consists of a number of leaf nodes. + *\param[in] numLeaves The number of leaf nodes. + *\~french + *\brief Calcule le nombre de niveaux nécessaire pour un BVH contenant le nombre de noeuds feuille donné. + *\param[in] numLeaves Le nombre de noeuds feuilles. 
+ */ + C3D_API static uint32_t getNumLevels( uint32_t numLeaves ); + /** + *\~english + *\brief Compute the number of (child) nodes needed to represent a BVH that consists of a number of leaf nodes. + *\param[in] numLeaves The number of leaf nodes. + *\~french + *\brief Calcule le nombre de noeuds (enfants) nécessaire pour représenter un BVH contenant le nombre de noeuds feuille donné. + *\param[in] numLeaves Le nombre de noeuds feuilles. + */ + C3D_API static uint32_t getNumNodes( uint32_t numLeaves ); castor::Point3ui const & getDimensions()const noexcept { @@ -55,31 +92,126 @@ namespace castor3d return m_clustersUbo; } - auto & getAabbBuffer()const noexcept + auto & getClustersAABBBuffer()const noexcept { + CU_Require( m_aabbBuffer ); return *m_aabbBuffer; } - auto & getPointLightIndexBuffer()const noexcept + auto & getPointLightClusterGridBuffer()const noexcept + { + CU_Require( m_pointLightClusterGridBuffer ); + return *m_pointLightClusterGridBuffer; + } + + auto & getSpotLightClusterGridBuffer()const noexcept + { + CU_Require( m_spotLightClusterGridBuffer ); + return *m_spotLightClusterGridBuffer; + } + + auto & getPointLightClusterIndexBuffer()const noexcept + { + CU_Require( m_pointLightClusterIndexBuffer ); + return *m_pointLightClusterIndexBuffer; + } + + auto & getSpotLightClusterIndexBuffer()const noexcept + { + CU_Require( m_spotLightClusterIndexBuffer ); + return *m_spotLightClusterIndexBuffer; + } + + auto & getLightsAABBBuffer()const noexcept + { + CU_Require( m_lightsAABBBuffer ); + return *m_lightsAABBBuffer; + } + + auto & getPointLightBVHBuffer()const noexcept + { + CU_Require( m_pointBVHBuffer ); + return *m_pointBVHBuffer; + } + + auto & getSpotLightBVHBuffer()const noexcept { - return *m_pointIndexBuffer; + CU_Require( m_spotBVHBuffer ); + return *m_spotBVHBuffer; } - auto & getPointLightClusterBuffer()const noexcept + void swapLightMortonIndicesIO() { - return *m_pointClusterBuffer; + m_lightMortonIndicesInput = 1u - 
m_lightMortonIndicesInput; } - auto & getSpotLightIndexBuffer()const noexcept + ashes::BufferBase & getPointLightIndicesBuffer( uint32_t index )const noexcept { - return *m_spotIndexBuffer; + return *m_pointIndicesBuffers[index]; } - auto & getSpotLightClusterBuffer()const noexcept + ashes::BufferBase & getSpotLightIndicesBuffer( uint32_t index )const noexcept { - return *m_spotClusterBuffer; + return *m_spotIndicesBuffers[index]; } + ashes::BufferBase & getPointLightMortonCodesBuffer( uint32_t index )const noexcept + { + return *m_pointMortonCodesBuffers[index]; + } + + ashes::BufferBase & getSpotLightMortonCodesBuffer( uint32_t index )const noexcept + { + return *m_spotMortonCodesBuffers[index]; + } + + ashes::BufferBase & getInputPointLightIndicesBuffer()const noexcept + { + return getPointLightIndicesBuffer( m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getInputSpotLightIndicesBuffer()const noexcept + { + return getSpotLightIndicesBuffer( m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getOutputPointLightIndicesBuffer()const noexcept + { + return getPointLightIndicesBuffer( 1u - m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getOutputSpotLightIndicesBuffer()const noexcept + { + return getSpotLightIndicesBuffer( 1u - m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getInputPointLightMortonCodesBuffer()const noexcept + { + return getPointLightMortonCodesBuffer( m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getInputSpotLightMortonCodesBuffer()const noexcept + { + return getSpotLightMortonCodesBuffer( m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getOutputPointLightMortonCodesBuffer()const noexcept + { + return getPointLightMortonCodesBuffer( 1u - m_lightMortonIndicesInput ); + } + + ashes::BufferBase & getOutputSpotLightMortonCodesBuffer()const noexcept + { + return getSpotLightMortonCodesBuffer( 1u - m_lightMortonIndicesInput ); + } + + auto & getCamera()const noexcept + { + return m_camera; + } + + 
OnClustersBuffersChanged onClusterBuffersChanged; + private: struct AABB { @@ -97,17 +229,26 @@ namespace castor3d bool m_clustersDirty{ true }; bool m_lightsDirty{ true }; bool m_first{ true }; + uint32_t m_lightMortonIndicesInput{ 1u }; castor::GroupChangeTracked< castor::Point3ui > m_dimensions; castor::GroupChangeTracked< uint32_t > m_clusterSize; castor::GroupChangeTracked< castor::Matrix4x4f > m_cameraProjection; castor::GroupChangeTracked< castor::Matrix4x4f > m_cameraView; castor::GroupChangeTracked< float > m_nearK; ClustersUbo m_clustersUbo; - ashes::BufferPtr< AABB > m_aabbBuffer; - ashes::BufferPtr< u32 > m_pointIndexBuffer; - ashes::BufferPtr< castor::Point2ui > m_pointClusterBuffer; - ashes::BufferPtr< u32 > m_spotIndexBuffer; - ashes::BufferPtr< castor::Point2ui > m_spotClusterBuffer; + ashes::BufferBasePtr m_aabbBuffer; + ashes::BufferBasePtr m_pointLightClusterGridBuffer; + ashes::BufferBasePtr m_spotLightClusterGridBuffer; + ashes::BufferBasePtr m_pointLightClusterIndexBuffer; + ashes::BufferBasePtr m_spotLightClusterIndexBuffer; + ashes::BufferBasePtr m_lightsAABBBuffer; + std::array< ashes::BufferBasePtr, 2u > m_pointMortonCodesBuffers; + std::array< ashes::BufferBasePtr, 2u > m_spotMortonCodesBuffers; + std::array< ashes::BufferBasePtr, 2u > m_pointIndicesBuffers; + std::array< ashes::BufferBasePtr, 2u > m_spotIndicesBuffers; + ashes::BufferBasePtr m_pointBVHBuffer; + ashes::BufferBasePtr m_spotBVHBuffer; + std::vector< ashes::BufferBasePtr > m_toDelete; }; } diff --git a/include/Core/Castor3D/Render/Clustered/ReduceLightsAABB.hpp b/include/Core/Castor3D/Render/Clustered/ReduceLightsAABB.hpp new file mode 100644 index 0000000000..525187ad49 --- /dev/null +++ b/include/Core/Castor3D/Render/Clustered/ReduceLightsAABB.hpp @@ -0,0 +1,18 @@ +/* +See LICENSE file in root folder +*/ +#ifndef ___C3D_ReduceLightsAABB_H___ +#define ___C3D_ReduceLightsAABB_H___ + +#include "ClusteredModule.hpp" + +namespace castor3d +{ + C3D_API crg::FramePass const & 
createReduceLightsAABBPass( crg::FramePassGroup & graph + , crg::FramePass const * previousPass + , RenderDevice const & device + , CameraUbo const & cameraUbo + , FrustumClusters & clusters ); +} + +#endif diff --git a/include/Core/Castor3D/Render/RenderTarget.hpp b/include/Core/Castor3D/Render/RenderTarget.hpp index 3e715028fc..f370e8527a 100644 --- a/include/Core/Castor3D/Render/RenderTarget.hpp +++ b/include/Core/Castor3D/Render/RenderTarget.hpp @@ -423,7 +423,7 @@ namespace castor3d return m_debugConfig; } - FrustumClusters const & getFrustumClusters()const noexcept + FrustumClusters & getFrustumClusters()const noexcept { CU_Require( m_frustumClusters ); return *m_frustumClusters; diff --git a/include/Core/Castor3D/Render/RenderTechnique.hpp b/include/Core/Castor3D/Render/RenderTechnique.hpp index 0b1b96fddd..55bd6b9cd8 100644 --- a/include/Core/Castor3D/Render/RenderTechnique.hpp +++ b/include/Core/Castor3D/Render/RenderTechnique.hpp @@ -419,8 +419,7 @@ namespace castor3d PrepassRendering m_prepass; crg::FramePass const * m_lastDepthPass{}; crg::FramePass const * m_depthRangePass{}; - crg::FramePass const* m_computeClustersAABB{}; - crg::FramePass const* m_dispatchLightInClusters{}; + crg::FramePass const * m_clustersLastPass{}; BackgroundRendererUPtr m_background{}; OpaqueRendering m_opaque; crg::FramePass const * m_lastOpaquePass{}; diff --git a/include/Core/Castor3D/Shader/Shaders/GlslClusteredLights.hpp b/include/Core/Castor3D/Shader/Shaders/GlslClusteredLights.hpp index 10c415467d..2ebe5b32d7 100644 --- a/include/Core/Castor3D/Shader/Shaders/GlslClusteredLights.hpp +++ b/include/Core/Castor3D/Shader/Shaders/GlslClusteredLights.hpp @@ -40,68 +40,157 @@ namespace castor3d::shader sdw::ShaderWriter & m_writer; bool m_enabled; ClustersDataUPtr m_clusterData; - std::unique_ptr< sdw::UInt32Array > m_pointLightGridIndex; - std::unique_ptr< sdw::U32Vec2Array > m_pointLightGridCluster; - std::unique_ptr< sdw::UInt32Array > m_spotLightGridIndex; - std::unique_ptr< 
sdw::U32Vec2Array > m_spotLightGridCluster; + std::unique_ptr< sdw::UInt32Array > m_pointLightIndices; + std::unique_ptr< sdw::U32Vec2Array > m_pointLightClusters; + std::unique_ptr< sdw::UInt32Array > m_spotLightIndices; + std::unique_ptr< sdw::U32Vec2Array > m_spotLightClusters; }; #define C3D_ClustersAABB( writer, binding, set ) \ - auto clustersAABBBuffer = writer.declStorageBuffer( "c3D_clustersAABBBuffer" \ + auto clustersAABBBuffer = writer.declStorageBuffer( "c3d_clustersAABBBuffer" \ , uint32_t( binding ) \ , set ); \ - auto c3D_clustersAABB = clustersAABBBuffer.declMemberArray< shader::AABB >( "b" ); \ + auto c3D_clustersAABB = clustersAABBBuffer.declMemberArray< shader::AABB >( "cb" ); \ clustersAABBBuffer.end() -#define C3D_PointLightGridIndicesEx( writer, binding, set, enabled ) \ - auto pointLightGridIndexBuffer = writer.declStorageBuffer( "c3d_pointLightGridIndexBuffer" \ + + +#define C3D_PointLightClusterGridEx( writer, binding, set, enabled ) \ + auto pointLightClusterGridBuffer = writer.declStorageBuffer( "c3d_pointLightClusterGridBuffer" \ , ( enabled ? 
uint32_t( binding ) : 0u ) \ , set \ , sdw::type::MemoryLayout::eStd430 \ , enabled ); \ - auto c3d_pointLightGridListCount = pointLightGridIndexBuffer.declMember< sdw::UInt >( "pc" , enabled ); \ - auto c3d_pointLightGridIndices = pointLightGridIndexBuffer.declMemberArray< sdw::UInt >( "pi" , enabled ); \ - pointLightGridIndexBuffer.end() + auto c3d_pointLightClusterGrid = pointLightClusterGridBuffer.declMemberArray< sdw::UVec2 >( "pg", enabled ); \ + pointLightClusterGridBuffer.end() -#define C3D_PointLightGridClustersEx( writer, binding, set, enabled ) \ - auto pointLightGridClustersBuffer = writer.declStorageBuffer( "c3d_pointLightGridClustersBuffer" \ +#define C3D_PointLightClusterGrid( writer, binding, set ) \ + C3D_PointLightClusterGridEx( writer, binding, set, true ) + +#define C3D_SpotLightClusterGridEx( writer, binding, set, enabled ) \ + auto spotLightClusterGridBuffer = writer.declStorageBuffer( "c3d_spotLightClusterGridBuffer" \ , ( enabled ? uint32_t( binding ) : 0u ) \ , set \ , sdw::type::MemoryLayout::eStd430 \ , enabled ); \ - auto c3d_pointLightGridClusters = pointLightGridClustersBuffer.declMemberArray< sdw::UVec2 >( "pg", enabled ); \ - pointLightGridClustersBuffer.end() + auto c3d_spotLightClusterGrid = spotLightClusterGridBuffer.declMemberArray< sdw::UVec2 >( "sg", enabled ); \ + spotLightClusterGridBuffer.end() + +#define C3D_SpotLightClusterGrid( writer, binding, set ) \ + C3D_SpotLightClusterGridEx( writer, binding, set, true ) -#define C3D_SpotLightGridIndicesEx( writer, binding, set, enabled ) \ - auto spotLightGridIndexBuffer = writer.declStorageBuffer( "c3d_spotLightGridIndexBuffer" \ + + +#define C3D_PointLightClusterIndexEx( writer, binding, set, enabled ) \ + auto pointLightClusterIndexBuffer = writer.declStorageBuffer( "c3d_pointLightClusterIndexBuffer" \ , ( enabled ? 
uint32_t( binding ) : 0u ) \ , set \ , sdw::type::MemoryLayout::eStd430 \ , enabled ); \ - auto c3d_spotLightGridListCount = spotLightGridIndexBuffer.declMember< sdw::UInt >( "sc" , enabled ); \ - auto c3d_spotLightGridIndices = spotLightGridIndexBuffer.declMemberArray< sdw::UInt >( "si" , enabled ); \ - spotLightGridIndexBuffer.end() + auto c3d_pointLightClusterListCount = pointLightClusterIndexBuffer.declMember< sdw::UInt >( "pc" , enabled ); \ + auto c3d_pointLightClusterIndex = pointLightClusterIndexBuffer.declMemberArray< sdw::UInt >( "pg", enabled ); \ + pointLightClusterIndexBuffer.end() + +#define C3D_PointLightClusterIndex( writer, binding, set ) \ + C3D_PointLightClusterIndexEx( writer, binding, set, true ) -#define C3D_SpotLightGridClustersEx( writer, binding, set, enabled ) \ - auto spotLightGridClustersBuffer = writer.declStorageBuffer( "c3d_spotLightGridClustersBuffer" \ +#define C3D_SpotLightClusterIndexEx( writer, binding, set, enabled ) \ + auto spotLightClusterIndexBuffer = writer.declStorageBuffer( "c3d_spotLightClusterIndexBuffer" \ , ( enabled ? 
uint32_t( binding ) : 0u ) \ , set \ , sdw::type::MemoryLayout::eStd430 \ , enabled ); \ - auto c3d_spotLightGridClusters = spotLightGridClustersBuffer.declMemberArray< sdw::UVec2 >( "sg", enabled ); \ - spotLightGridClustersBuffer.end() + auto c3d_spotLightClusterListCount = spotLightClusterIndexBuffer.declMember< sdw::UInt >( "sc" , enabled ); \ + auto c3d_spotLightClusterIndex = spotLightClusterIndexBuffer.declMemberArray< sdw::UInt >( "sg", enabled ); \ + spotLightClusterIndexBuffer.end() + +#define C3D_SpotLightClusterIndex( writer, binding, set ) \ + C3D_SpotLightClusterIndexEx( writer, binding, set, true ) + + + +#define C3D_LightsAABB( writer, binding, set ) \ + auto lightsAABBBuffer = writer.declStorageBuffer( "c3d_lightsAABBBuffer" \ + , uint32_t( binding ) \ + , set ); \ + auto c3d_lightsAABB = lightsAABBBuffer.declMemberArray< shader::AABB >( "cb" ); \ + lightsAABBBuffer.end() + +#define C3D_PointLightBVHEx( writer, binding, set, enabled ) \ + auto pointLightBVHBuffer = writer.declStorageBuffer( "c3d_pointLightBVHBuffer" \ + , uint32_t( binding ) \ + , set \ + , sdw::type::MemoryLayout::eStd430 \ + , enabled ); \ + auto c3d_pointLightBVH = pointLightBVHBuffer.declMemberArray< shader::AABB >( "pb", enabled ); \ + pointLightBVHBuffer.end() + +#define C3D_PointLightBVH( writer, binding, set ) \ + C3D_PointLightBVHEx( writer, binding, set, true ) + +#define C3D_SpotLightBVHEx( writer, binding, set, enabled ) \ + auto spotLightBVHBuffer = writer.declStorageBuffer( "c3d_spotLightBVHBuffer" \ + , uint32_t( binding ) \ + , set \ + , sdw::type::MemoryLayout::eStd430 \ + , enabled ); \ + auto c3d_spotLightBVH = spotLightBVHBuffer.declMemberArray< shader::AABB >( "sb", enabled ); \ + spotLightBVHBuffer.end() + +#define C3D_SpotLightBVH( writer, binding, set ) \ + C3D_SpotLightBVHEx( writer, binding, set, true ) -#define C3D_PointLightGridIndices( writer, binding, set ) \ - C3D_PointLightGridIndicesEx( writer, binding, set, true ) -#define 
C3D_PointLightGridClusters( writer, binding, set ) \ - C3D_PointLightGridClustersEx( writer, binding, set, true ) -#define C3D_SpotLightGridIndices( writer, binding, set ) \ - C3D_SpotLightGridIndicesEx( writer, binding, set, true ) +#define C3D_PointLightIndicesEx( writer, binding, set, enabled ) \ + auto pointLightIndicesBuffer = writer.declStorageBuffer( "c3d_pointLightIndicesBuffer" \ + , ( enabled ? uint32_t( binding ) : 0u ) \ + , set \ + , sdw::type::MemoryLayout::eStd430 \ + , enabled ); \ + auto c3d_pointLightIndices = pointLightIndicesBuffer.declMemberArray< sdw::UInt >( "pi" , enabled ); \ + pointLightIndicesBuffer.end() + +#define C3D_PointLightIndices( writer, binding, set ) \ + C3D_PointLightIndicesEx( writer, binding, set, true ) + +#define C3D_SpotLightIndicesEx( writer, binding, set, enabled ) \ + auto spotLightIndicesBuffer = writer.declStorageBuffer( "c3d_spotLightIndicesBuffer" \ + , ( enabled ? uint32_t( binding ) : 0u ) \ + , set \ + , sdw::type::MemoryLayout::eStd430 \ + , enabled ); \ + auto c3d_spotLightIndices = spotLightIndicesBuffer.declMemberArray< sdw::UInt >( "si" , enabled ); \ + spotLightIndicesBuffer.end() + +#define C3D_SpotLightIndices( writer, binding, set ) \ + C3D_SpotLightIndicesEx( writer, binding, set, true ) + + + +#define C3D_PointLightMortonCodesEx( writer, binding, set, enabled ) \ + auto pointLightMortonCodesBuffer = writer.declStorageBuffer( "c3d_pointLightMortonCodesBuffer" \ + , ( enabled ? 
uint32_t( binding ) : 0u ) \ + , set \ + , sdw::type::MemoryLayout::eStd430 \ + , enabled ); \ + auto c3d_pointLightMortonCodes = pointLightMortonCodesBuffer.declMemberArray< sdw::UInt >( "pm", enabled ); \ + pointLightMortonCodesBuffer.end() + +#define C3D_PointLightMortonCodes( writer, binding, set ) \ + C3D_PointLightMortonCodesEx( writer, binding, set, true ) + +#define C3D_SpotLightMortonCodesEx( writer, binding, set, enabled ) \ + auto spotLightMortonCodesBuffer = writer.declStorageBuffer( "c3d_spotLightMortonCodesBuffer" \ + , ( enabled ? uint32_t( binding ) : 0u ) \ + , set \ + , sdw::type::MemoryLayout::eStd430 \ + , enabled ); \ + auto c3d_spotLightMortonCodes= spotLightMortonCodesBuffer.declMemberArray< sdw::UInt >( "sm", enabled ); \ + spotLightMortonCodesBuffer.end() -#define C3D_SpotLightGridClusters( writer, binding, set ) \ - C3D_SpotLightGridClustersEx( writer, binding, set, true ) +#define C3D_SpotLightMortonCodes( writer, binding, set ) \ + C3D_SpotLightMortonCodesEx( writer, binding, set, true ) } #endif diff --git a/include/Core/Castor3D/Shader/Shaders/SdwModule.hpp b/include/Core/Castor3D/Shader/Shaders/SdwModule.hpp index 2965195a08..5a0f4fa68c 100644 --- a/include/Core/Castor3D/Shader/Shaders/SdwModule.hpp +++ b/include/Core/Castor3D/Shader/Shaders/SdwModule.hpp @@ -217,6 +217,7 @@ namespace castor3d::shader C3D_API uint32_t getSpotShadowMapCount(); C3D_API uint32_t getPointShadowMapCount(); + C3D_API void groupMemoryBarrierWithGroupSync( sdw::ShaderWriter & writer ); C3D_API castor::String concatModelNames( castor::String lhs , castor::String rhs ); diff --git a/include/Core/Castor3D/Shader/Ubos/CameraUbo.hpp b/include/Core/Castor3D/Shader/Ubos/CameraUbo.hpp index a4af09f67b..6cacbad03a 100644 --- a/include/Core/Castor3D/Shader/Ubos/CameraUbo.hpp +++ b/include/Core/Castor3D/Shader/Ubos/CameraUbo.hpp @@ -289,15 +289,18 @@ namespace castor3d }; } -#define C3D_Camera( writer, binding, set )\ +#define C3D_CameraEx( writer, binding, set, enabled 
)\ sdw::UniformBuffer camera{ writer\ , "C3D_Camera"\ , "c3d_camera"\ , uint32_t( binding )\ , uint32_t( set )\ , sdw::type::MemoryLayout::eStd140\ - , true };\ - auto c3d_cameraData = camera.declMember< castor3d::shader::CameraData >( "c" );\ + , enabled };\ + auto c3d_cameraData = camera.declMember< castor3d::shader::CameraData >( "c", enabled );\ camera.end() +#define C3D_Camera( writer, binding, set )\ + C3D_CameraEx( writer, binding, set, true ) + #endif diff --git a/include/Core/Castor3D/Shader/Ubos/ClustersUbo.hpp b/include/Core/Castor3D/Shader/Ubos/ClustersUbo.hpp index 9b0fcd670b..b0e2baab45 100644 --- a/include/Core/Castor3D/Shader/Ubos/ClustersUbo.hpp +++ b/include/Core/Castor3D/Shader/Ubos/ClustersUbo.hpp @@ -22,7 +22,11 @@ namespace castor3d , sdw::FloatField< "viewNear" > // Distance to the near clipping plane. , sdw::U32Vec2Field< "clusterSize" > // Screenspace size of a cluster , sdw::FloatField< "nearK" > // ( 1 + ( 2 * tan( fov * 0.5 ) / dimensions.y ) ) // Used to compute the near plane for clusters at depth k. 
- , sdw::FloatField< "logGridDimY" > > // 1.0f / log( 1 + ( tan( fov * 0.5 ) / dimensions.y ) + , sdw::FloatField< "logGridDimY" > // 1.0f / log( 1 + ( tan( fov * 0.5 ) / dimensions.y ) + , sdw::UIntField< "pointLightLevels" > + , sdw::UIntField< "spotLightLevels" > + , sdw::UIntField< "pointLightCount" > + , sdw::UIntField< "spotLightCount" > > { ClustersData( sdw::ShaderWriter & writer , ast::expr::ExprPtr expr @@ -36,6 +40,10 @@ namespace castor3d auto clusterSize()const { return getMember< "clusterSize" >(); } auto nearK()const { return getMember< "nearK" >(); } auto logGridDimY()const { return getMember< "logGridDimY" >(); } + auto pointLightLevels()const { return getMember< "pointLightLevels" >(); } + auto spotLightLevels()const { return getMember< "spotLightLevels" >(); } + auto pointLightCount()const { return getMember< "pointLightCount" >(); } + auto spotLightCount()const { return getMember< "spotLightCount" >(); } C3D_API sdw::RetU32Vec3 computeClusterIndex3D( sdw::UInt32 const index ); C3D_API sdw::RetU32Vec3 computeClusterIndex3D( sdw::Vec2 const screenPos @@ -68,11 +76,9 @@ namespace castor3d C3D_API void cpuUpdate( castor::Point3ui gridDim , float viewNear , uint32_t clusterSize - , castor::Angle const & fov ); - C3D_API void cpuUpdate( castor::Point3ui gridDim - , float viewNear - , uint32_t clusterSize - , float nearK ); + , float nearK + , uint32_t pointLightsCount + , uint32_t spotLightsCount ); void createPassBinding( crg::FramePass & pass , uint32_t binding )const @@ -119,7 +125,7 @@ namespace castor3d , uint32_t( set ) \ , ast::type::MemoryLayout::eStd140 \ , enabled }; \ - auto c3d_clustersData = clusters.declMember< castor3d::shader::ClustersData >( "d", enabled ); \ + auto c3d_clustersData = clusters.declMember< castor3d::shader::ClustersData >( "c", enabled ); \ clusters.end() #define C3D_Clusters( writer, binding, set ) \ diff --git a/include/Core/Castor3D/Shader/Ubos/UbosModule.hpp b/include/Core/Castor3D/Shader/Ubos/UbosModule.hpp index 
67ec555284..fb3a7b2260 100644 --- a/include/Core/Castor3D/Shader/Ubos/UbosModule.hpp +++ b/include/Core/Castor3D/Shader/Ubos/UbosModule.hpp @@ -238,6 +238,14 @@ namespace castor3d float nearK; // 1.0f / log( 1 + ( tan( fov * 0.5 ) / ClusterGridDim.y ) float logGridDimY; + // The number of BVH levels for point lights. + uint32_t pointLightLevelsCount; + // The number of BVH levels for spot lights. + uint32_t spotLightLevelsCount; + // The number of point lights. + uint32_t pointLightsCount; + // The number of spot lights. + uint32_t spotLightsCount; }; /** *\~english diff --git a/include/Core/CastorUtils/Config/SmartPtr.hpp b/include/Core/CastorUtils/Config/SmartPtr.hpp index 2778a411bf..64989ead8b 100644 --- a/include/Core/CastorUtils/Config/SmartPtr.hpp +++ b/include/Core/CastorUtils/Config/SmartPtr.hpp @@ -44,6 +44,18 @@ namespace castor { return UniquePtr< TypeU >( &static_cast< TypeU & >( *ptr.release() ) ); } + + template< typename TypeU, typename TypeT > + std::unique_ptr< TypeU > ptrCast( std::unique_ptr< TypeT > ptr ) + { + return std::unique_ptr< TypeU >( &static_cast< TypeU & >( *ptr.release() ) ); + } + + template< typename TypeU, typename TypeT > + std::unique_ptr< TypeU > ptrRefCast( std::unique_ptr< TypeT > & ptr ) + { + return std::unique_ptr< TypeU >( &static_cast< TypeU & >( *ptr.release() ) ); + } } #define CU_DeclareSmartPtr( nmspc, class_name, expdecl )\ diff --git a/include/Core/CastorUtils/Design/DataHolder.hpp b/include/Core/CastorUtils/Design/DataHolder.hpp index 892d4f8271..2d4e0d49af 100644 --- a/include/Core/CastorUtils/Design/DataHolder.hpp +++ b/include/Core/CastorUtils/Design/DataHolder.hpp @@ -8,7 +8,7 @@ See LICENSE file in root folder namespace castor { - template< class Data > + template< class Data, size_t Index > class DataHolderT { public: diff --git a/include/Core/CastorUtils/Design/DesignModule.hpp b/include/Core/CastorUtils/Design/DesignModule.hpp index 6c2be3026a..2305e7851f 100644 --- 
a/include/Core/CastorUtils/Design/DesignModule.hpp +++ b/include/Core/CastorUtils/Design/DesignModule.hpp @@ -122,7 +122,7 @@ namespace castor *\brief Conteneur de donnée. *\remark Utilisé pour forcer l'ordre d'initialisation des données via l'héritage. */ - template< class Data > + template< class Data, size_t Index = 0u > class DataHolderT; /** \~english diff --git a/include/Core/CastorUtils/Exception/Assertion.hpp b/include/Core/CastorUtils/Exception/Assertion.hpp index 28044b45c4..7f54bf7d52 100644 --- a/include/Core/CastorUtils/Exception/Assertion.hpp +++ b/include/Core/CastorUtils/Exception/Assertion.hpp @@ -23,11 +23,11 @@ namespace castor # if !defined( NDEBUG ) # define CU_Assert( pred, text ) ( !!( pred ) )\ - || ( castor::cuFailure( text ), 0 ) -#else + || ( castor::cuFailure( text ), 0 ) +# else # define CU_Assert( pred, text ) ( !!( pred ) )\ - || ( castor::cuLogError( text ), 0 ) -#endif + || ( castor::cuLogError( text ), 0 ) +# endif //!\~english Calls invariant checking function. //!\~french Appelle la fonction de vérification des invariants de classe. 
diff --git a/source/Core/Castor3D/CMakeLists.txt b/source/Core/Castor3D/CMakeLists.txt index ca029ab768..e8fc40c6bc 100644 --- a/source/Core/Castor3D/CMakeLists.txt +++ b/source/Core/Castor3D/CMakeLists.txt @@ -1132,15 +1132,21 @@ source_group( "Source Files\\Render" FILES ${${PROJECT_NAME}_FOLDER_SRC_FILES} ) set( ${PROJECT_NAME}_FOLDER_SRC_FILES ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/AssignLightsToClusters.cpp + ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/BuildLightsBVH.cpp ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/ClusteredModule.cpp ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/ComputeClustersAABB.cpp + ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/ComputeLightsMortonCode.cpp ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/FrustumClusters.cpp + ${CASTOR_SOURCE_DIR}/source/Core/${PROJECT_NAME}/Render/Clustered/ReduceLightsAABB.cpp ) set( ${PROJECT_NAME}_FOLDER_HDR_FILES ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/AssignLightsToClusters.hpp + ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/BuildLightsBVH.hpp ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/ClusteredModule.hpp ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/ComputeClustersAABB.hpp + ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/ComputeLightsMortonCode.hpp ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/FrustumClusters.hpp + ${CASTOR_SOURCE_DIR}/include/Core/${PROJECT_NAME}/Render/Clustered/ReduceLightsAABB.hpp ) set( ${PROJECT_NAME}_SRC_FILES ${${PROJECT_NAME}_SRC_FILES} diff --git a/source/Core/Castor3D/DebugDefines.hpp b/source/Core/Castor3D/DebugDefines.hpp index eca2f37947..f5970f3f1e 100644 --- a/source/Core/Castor3D/DebugDefines.hpp +++ b/source/Core/Castor3D/DebugDefines.hpp @@ -15,4 +15,6 @@ See LICENSE file in root folder #define 
C3D_DebugDisableShadowMaps 0 #define C3D_DebugDisableSafeBands 0 +#define C3D_DebugUseLightsBVH 1 + #endif diff --git a/source/Core/Castor3D/Event/Frame/CpuFrameEvent.cpp b/source/Core/Castor3D/Event/Frame/CpuFrameEvent.cpp index c11df54bc9..b08321bfa2 100644 --- a/source/Core/Castor3D/Event/Frame/CpuFrameEvent.cpp +++ b/source/Core/Castor3D/Event/Frame/CpuFrameEvent.cpp @@ -2,12 +2,14 @@ #include +#define C3D_UseEventsStack 0 + namespace castor3d { CpuFrameEvent::CpuFrameEvent( CpuFrameEvent const & rhs ) : m_type{ rhs.m_type } , m_skip{ rhs.m_skip.load() } -#if !defined( NDEBUG ) +#if !defined( NDEBUG ) && C3D_UseEventsStack , m_stackTrace{ rhs.m_stackTrace } #endif { @@ -16,7 +18,7 @@ namespace castor3d CpuFrameEvent::CpuFrameEvent( CpuFrameEvent && rhs ) : m_type{ rhs.m_type } , m_skip{ rhs.m_skip.load() } -#if !defined( NDEBUG ) +#if !defined( NDEBUG ) && C3D_UseEventsStack , m_stackTrace{ std::move( rhs.m_stackTrace ) } #endif { @@ -27,7 +29,7 @@ namespace castor3d { m_type = rhs.m_type; m_skip = rhs.m_skip.load(); -#if !defined( NDEBUG ) +#if !defined( NDEBUG ) && C3D_UseEventsStack m_stackTrace = rhs.m_stackTrace; #endif @@ -38,7 +40,7 @@ namespace castor3d { m_type = rhs.m_type; m_skip = rhs.m_skip.load(); -#if !defined( NDEBUG ) +#if !defined( NDEBUG ) && C3D_UseEventsStack m_stackTrace = std::move( rhs.m_stackTrace ); #endif rhs.m_skip = true; @@ -49,7 +51,7 @@ namespace castor3d CpuFrameEvent::CpuFrameEvent( EventType type ) : m_type{ type } { -#if !defined( NDEBUG ) +#if !defined( NDEBUG ) && C3D_UseEventsStack castor::StringStream stream = castor::makeStringStream(); stream << castor::Debug::Backtrace{ 20 }; diff --git a/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp b/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp index 4cbed6bc56..4ad5400dca 100644 --- a/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp +++ b/source/Core/Castor3D/Render/Clustered/AssignLightsToClusters.cpp @@ -1,10 +1,13 @@ #include 
"Castor3D/Render/Clustered/AssignLightsToClusters.hpp" +#include "Castor3D/DebugDefines.hpp" #include "Castor3D/Engine.hpp" #include "Castor3D/Cache/LightCache.hpp" #include "Castor3D/Render/RenderDevice.hpp" #include "Castor3D/Render/RenderSystem.hpp" #include "Castor3D/Render/Clustered/FrustumClusters.hpp" +#include "Castor3D/Scene/Camera.hpp" +#include "Castor3D/Scene/Scene.hpp" #include "Castor3D/Scene/Light/PointLight.hpp" #include "Castor3D/Scene/Light/SpotLight.hpp" #include "Castor3D/Shader/Program.hpp" @@ -35,16 +38,33 @@ namespace castor3d eLights, eClusters, eClustersAABB, - ePointLightGridIndex, - ePointLightGridCluster, - eSpotLightGridIndex, - eSpotLightGridCluster, + ePointLightIndex, + ePointLightCluster, + eSpotLightIndex, + eSpotLightCluster, + ePointLightBVH, + eSpotLightBVH, + ePointLightIndices, + eSpotLightIndices, }; - static ShaderPtr createShader() +#if C3D_DebugUseLightsBVH + static uint32_t constexpr NumThreads = 32u; +#else + static uint32_t constexpr NumThreads = MaxLightsPerCluster; +#endif + + static ShaderPtr createShader( bool useLightBVH ) { - using namespace sdw; - ComputeWriter writer; + sdw::ComputeWriter writer; + + auto c3d_numChildNodes = writer.declConstantArray< sdw::UInt >( "c3d_numChildNodes" + , { 1_u /* 1 level =32^0 */ + , 33_u /* 2 levels +32^1 */ + , 1057_u /* 3 levels +32^2 */ + , 33825_u /* 4 levels +32^3 */ + , 1082401_u /* 5 levels +32^4 */ + , 34636833_u /* 6 levels +32^5 */ } ); // Inputs C3D_Camera( writer @@ -59,18 +79,42 @@ namespace castor3d C3D_ClustersAABB( writer , eClustersAABB , 0u ); - C3D_PointLightGridIndices( writer - , ePointLightGridIndex + C3D_PointLightClusterIndex( writer + , ePointLightIndex , 0u ); - C3D_PointLightGridClusters( writer - , ePointLightGridCluster + C3D_PointLightClusterGrid( writer + , ePointLightCluster , 0u ); - C3D_SpotLightGridIndices( writer - , eSpotLightGridIndex + C3D_SpotLightClusterIndex( writer + , eSpotLightIndex , 0u ); - C3D_SpotLightGridClusters( writer - , 
eSpotLightGridCluster + C3D_SpotLightClusterGrid( writer + , eSpotLightCluster , 0u ); + C3D_PointLightBVHEx( writer + , ePointLightBVH + , 0u + , useLightBVH ); + C3D_SpotLightBVHEx( writer + , eSpotLightBVH + , 0u + , useLightBVH ); + C3D_PointLightIndicesEx( writer + , ePointLightIndices + , 0u + , useLightBVH ); + C3D_SpotLightIndicesEx( writer + , eSpotLightIndices + , 0u + , useLightBVH ); + + // Using a stack of node IDs to traverse the BVH was inspired by: + // Source: https://devblogs.nvidia.com/parallelforall/thinking-parallel-part-ii-tree-traversal-gpu/ + // Author: Tero Karras (NVIDIA) + // Retrieved: September 13, 2016 + auto gsNodeStack = writer.declSharedVariable< sdw::UInt >( "gsNodeStack", 1024u, useLightBVH ); // This should be enough to push 32 layers of nodes (32 nodes per layer). + auto gsStackPtr = writer.declSharedVariable< sdw::Int >( "gsStackPtr", useLightBVH ); // The current index in the node stack. + auto gsParentIndex = writer.declSharedVariable< sdw::UInt >( "gsParentIndex", useLightBVH ); // The index of the parent node in the BVH that is currently being processed. 
auto gsPointLightStartOffset = writer.declSharedVariable< sdw::UInt >( "gsPointLightStartOffset" ); auto gsPointLights = writer.declSharedVariable< shader::AppendArrayT< sdw::UInt > >( "gsPointLights" @@ -86,6 +130,95 @@ namespace castor3d auto gsClusterSphere = writer.declSharedVariable< sdw::Vec4 >( "gsClusterSphere" ); shader::Utils utils{ writer }; + sdw::Function< sdw::Void, sdw::InUInt > pushNode; + sdw::Function< sdw::UInt > popNode; + sdw::Function< sdw::Boolean, shader::InAABB, shader::InAABB > aabbIntersectAABB; + + std::function< sdw::UInt( sdw::UInt, sdw::UInt ) > getFirstChild; + std::function< sdw::Boolean( sdw::UInt, sdw::UInt ) > isLeafNode; + std::function< sdw::UInt( sdw::UInt, sdw::UInt ) > getLeafIndex; + + if ( useLightBVH ) + { + pushNode = writer.implementFunction< sdw::Void >( "pushNode" + , [&]( sdw::UInt const & nodeIndex ) + { + auto stackPtr = writer.declLocale( "stackPtr" + , sdw::atomicAdd( gsStackPtr, 1_i ) ); + + IF( writer, stackPtr < 1024 ) + { + gsNodeStack[stackPtr] = nodeIndex; + } + FI; + } + , sdw::InUInt{ writer, "nodeIndex" } ); + + popNode = writer.implementFunction< sdw::UInt >( "popNode" + , [&]() + { + auto nodeIndex = writer.declLocale( "nodeIndex" + , 0_u ); + auto stackPtr = writer.declLocale( "stackPtr" + , sdw::atomicAdd( gsStackPtr, -1_i ) ); + + IF( writer, stackPtr > 0 && stackPtr < 1024 ) + { + nodeIndex = gsNodeStack[stackPtr - 1]; + } + FI; + + writer.returnStmt( nodeIndex ); + } ); + + // Check to see if on AABB intersects another AABB. 
+ // Source: Real-time collision detection, Christer Ericson (2005) + aabbIntersectAABB = writer.implementFunction< sdw::Boolean >( "aabbIntersectAABB" + , [&]( shader::AABB const & a + , shader::AABB const & b ) + { + auto result = writer.declLocale( "result" + , 1_b ); + + for ( int i = 0; i < 3; ++i ) + { + result = result + && ( a.max()[i] >= b.min()[i] + && a.min()[i] <= b.max()[i] ); + } + + writer.returnStmt( result ); + } + , shader::InAABB{ writer, "a" } + , shader::InAABB{ writer, "b" } ); + + // Get the index of the the first child node in the BVH. + getFirstChild = [&]( sdw::UInt const parentIndex + , sdw::UInt const numLevels ) + { + return writer.ternary( numLevels > 0_u + , parentIndex * 32_u + 1_u + , 0_u ); + }; + + // Check to see if an index of the BVH is a leaf. + isLeafNode = [&]( sdw::UInt const childIndex + , sdw::UInt const numLevels ) + { + return writer.ternary( numLevels > 0_u + , childIndex > ( c3d_numChildNodes[numLevels - 1_u] - 1_u ) + , 1_b ); + }; + + // Get the index of a leaf node given the node ID in the BVH. 
+ getLeafIndex = [&]( sdw::UInt const nodeIndex + , sdw::UInt const numLevels ) + { + return writer.ternary( numLevels > 0_u + , nodeIndex - c3d_numChildNodes[numLevels - 1_u] + , nodeIndex ); + }; + } auto sphereInsideAABB = writer.implementFunction< sdw::Boolean >( "sphereInsideAABB" , [&]( sdw::Vec3 const & sphereCenter @@ -141,120 +274,243 @@ namespace castor3d , shader::InCone{ writer, "cone" } , sdw::InVec4{ writer, "sphere" } ); - writer.implementMainT< VoidT >( MaxLightsPerCluster - , [&]( ComputeIn in ) + writer.implementMainT< sdw::VoidT >( NumThreads + , [&]( sdw::ComputeIn in ) { - auto groupIndex = writer.declLocale( "groupIndex" - , writer.cast< sdw::UInt >( in.localInvocationIndex ) ); + auto groupIndex = in.localInvocationIndex; - IF( writer, groupIndex == 0u ) + IF( writer, groupIndex == 0_u ) { gsPointLights.resetCount(); gsSpotLights.resetCount(); + gsStackPtr = 0_i; + gsParentIndex = 0_u; - gsClusterIndex3D = in.workGroupID; - gsClusterIndex1D = c3d_clustersData.computeClusterIndex1D( gsClusterIndex3D ); + gsClusterIndex1D = in.workGroupID.x(); + gsClusterIndex3D = c3d_clustersData.computeClusterIndex3D( gsClusterIndex1D ); gsClusterAABB = c3D_clustersAABB[gsClusterIndex1D]; auto aabbCenter = writer.declLocale( "aabbCenter" - , gsClusterAABB.min().xyz() + ( gsClusterAABB.max().xyz() - gsClusterAABB.min().xyz() ) / 2.0f ); + , gsClusterAABB.min().xyz() + ( gsClusterAABB.max().xyz() - gsClusterAABB.min().xyz() ) / 2.0_f ); gsClusterSphere = vec4( aabbCenter, distance( gsClusterAABB.max().xyz(), aabbCenter ) ); + + if ( useLightBVH ) + { + pushNode( 0_u ); + } } FI; - writer.controlBarrier( sdw::type::Scope::eWorkgroup - , sdw::type::Scope::eWorkgroup - , ( sdw::type::MemorySemanticsMask::eAcquireRelease - | sdw::type::MemorySemanticsMask::eWorkgroupMemory ) ); - - // Intersect point lights against AABB. 
- auto cur = writer.declLocale( "cur" - , lights.getDirectionalsEnd() + groupIndex * PointLight::LightDataComponents ); - auto end = writer.declLocale( "end" - , lights.getPointsEnd() ); + shader::groupMemoryBarrierWithGroupSync( writer ); - WHILE( writer, cur < lights.getPointsEnd() ) + if ( useLightBVH ) { - auto point = writer.declLocale( "point" - , lights.getPointLight( cur ) ); + auto childOffset = writer.declLocale( "childOffset", groupIndex ); + + // Check point light BVH + DOWHILE( writer, gsParentIndex > 0_u ) + { + auto childIndex = writer.declLocale( "childIndex" + , getFirstChild( gsParentIndex, c3d_clustersData.pointLightLevels() ) + childOffset ); + + IF( writer, isLeafNode( childIndex, c3d_clustersData.pointLightLevels() ) ) + { + auto leafIndex = writer.declLocale( "leafIndex" + , getLeafIndex( childIndex, c3d_clustersData.pointLightLevels() ) ); + + IF( writer, leafIndex < c3d_clustersData.pointLightCount() ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , c3d_pointLightIndices[leafIndex] ); + auto point = writer.declLocale( "point" + , lights.getPointLight( lightOffset ) ); + + IF( writer, sphereInsideAABB( c3d_cameraData.worldToCurView( vec4( point.position(), 1.0_f ) ).xyz() + , point.base().farPlane() + , gsClusterAABB ) ) + { + gsPointLights.appendData( lightOffset, MaxLightsPerCluster ); + } + FI; + } + FI; + } + ELSEIF( aabbIntersectAABB( gsClusterAABB, c3d_pointLightBVH[childIndex] ) ) + { + pushNode( childIndex ); + } + FI; + + shader::groupMemoryBarrierWithGroupSync( writer ); + + IF( writer, groupIndex == 0_u ) + { + gsParentIndex = popNode(); + } + FI; + + shader::groupMemoryBarrierWithGroupSync( writer ); + } + ELIHWOD; + + shader::groupMemoryBarrierWithGroupSync( writer ); - IF( writer, sphereInsideAABB( c3d_cameraData.worldToCurView( vec4( point.position(), 1.0_f ) ).xyz() - , point.base().farPlane() - , gsClusterAABB ) ) + // Reset the stack. 
+ IF( writer, groupIndex == 0_u ) { - gsPointLights.appendData( cur, MaxLightsPerCluster ); + gsStackPtr = 0_i; + gsParentIndex = 0_u; + + // Push the root node (at index 0) on the node stack. + pushNode( 0_u ); } FI; - cur += PointLight::LightDataComponents * MaxLightsPerCluster; - } - ELIHW; + shader::groupMemoryBarrierWithGroupSync( writer ); - // Intersect spot lights against AABB. - cur = end + groupIndex * SpotLight::LightDataComponents; - end = lights.getSpotsEnd(); + // Check spot light BVH + DOWHILE( writer, gsParentIndex > 0_u ) + { + auto childIndex = writer.declLocale( "childIndex" + , getFirstChild( gsParentIndex, c3d_clustersData.spotLightLevels() ) + childOffset ); - WHILE( writer, cur < end ) - { - auto spot = writer.declLocale( "spot" - , lights.getSpotLight( cur ) ); + IF( writer, isLeafNode( childIndex, c3d_clustersData.spotLightLevels() ) ) + { + auto leafIndex = writer.declLocale( "leafIndex" + , getLeafIndex( childIndex, c3d_clustersData.spotLightLevels() ) ); + + IF( writer, leafIndex < c3d_clustersData.spotLightCount() ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , c3d_spotLightIndices[leafIndex] ); + auto spot = writer.declLocale( "spot" + , lights.getSpotLight( lightOffset ) ); #if 1 - IF( writer, sphereInsideAABB( c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ).xyz() - , spot.base().farPlane() - , gsClusterAABB ) ) + IF( writer, sphereInsideAABB( c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ).xyz() + , spot.base().farPlane() + , gsClusterAABB ) ) #else - IF( writer, coneInsideSphere( shader::Cone{ c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ).xyz() - , spot.direction() - , spot.base().farPlane() - , spot.outerCutOff() - , spot.outerCutOffCos() - , spot.outerCutOffSin() } - , gsClusterSphere ) ) + IF( writer, coneInsideSphere( shader::Cone{ c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ).xyz() + , spot.direction() + , spot.base().farPlane() + , 
spot.outerCutOff() + , spot.outerCutOffCos() + , spot.outerCutOffSin() } + , gsClusterSphere ) ) #endif - { - gsSpotLights.appendData( cur, MaxLightsPerCluster ); + { + gsSpotLights.appendData( lightOffset, MaxLightsPerCluster ); + } + FI; + } + FI; + } + ELSEIF( aabbIntersectAABB( gsClusterAABB, c3d_spotLightBVH[childIndex] ) ) + { + pushNode( childIndex ); + } + FI; + + shader::groupMemoryBarrierWithGroupSync( writer ); + + IF( writer, groupIndex == 0_u ) + { + gsParentIndex = popNode(); + } + FI; + + shader::groupMemoryBarrierWithGroupSync( writer ); } - FI; + ELIHWOD; - cur += SpotLight::LightDataComponents * MaxLightsPerCluster; + shader::groupMemoryBarrierWithGroupSync( writer ); } - ELIHW; + else + { + // Intersect point lights against AABB. + auto cur = writer.declLocale( "cur" + , lights.getDirectionalsEnd() + groupIndex * PointLight::LightDataComponents ); + auto end = writer.declLocale( "end" + , lights.getPointsEnd() ); - writer.controlBarrier( sdw::type::Scope::eWorkgroup - , sdw::type::Scope::eWorkgroup - , ( sdw::type::MemorySemanticsMask::eAcquireRelease - | sdw::type::MemorySemanticsMask::eWorkgroupMemory ) ); + WHILE( writer, cur < lights.getPointsEnd() ) + { + auto point = writer.declLocale( "point" + , lights.getPointLight( cur ) ); + + IF( writer, sphereInsideAABB( c3d_cameraData.worldToCurView( vec4( point.position(), 1.0_f ) ).xyz() + , point.base().farPlane() + , gsClusterAABB ) ) + { + gsPointLights.appendData( cur, MaxLightsPerCluster ); + } + FI; + + cur += PointLight::LightDataComponents * MaxLightsPerCluster; + } + ELIHW; + + // Intersect spot lights against AABB. 
+ cur = end + groupIndex * SpotLight::LightDataComponents; + end = lights.getSpotsEnd(); + + WHILE( writer, cur < end ) + { + auto spot = writer.declLocale( "spot" + , lights.getSpotLight( cur ) ); + +#if 1 + IF( writer, sphereInsideAABB( c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ).xyz() + , spot.base().farPlane() + , gsClusterAABB ) ) +#else + IF( writer, coneInsideSphere( shader::Cone{ c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ).xyz() + , spot.direction() + , spot.base().farPlane() + , spot.outerCutOff() + , spot.outerCutOffCos() + , spot.outerCutOffSin() } + , gsClusterSphere ) ) +#endif + { + gsSpotLights.appendData( cur, MaxLightsPerCluster ); + } + FI; + + cur += SpotLight::LightDataComponents * MaxLightsPerCluster; + } + ELIHW; + + shader::groupMemoryBarrierWithGroupSync( writer ); + } // Now update the global light grids with the light lists and light counts. IF( writer, groupIndex == 0u ) { gsPointLights.getCount() = min( sdw::UInt{ MaxLightsPerCluster }, gsPointLights.getCount() ); - gsPointLightStartOffset = sdw::atomicAdd( c3d_pointLightGridListCount, gsPointLights.getCount() ); - c3d_pointLightGridClusters[gsClusterIndex1D] = sdw::uvec2( gsPointLightStartOffset, gsPointLights.getCount() ); + gsPointLightStartOffset = sdw::atomicAdd( c3d_pointLightClusterListCount, gsPointLights.getCount() ); + c3d_pointLightClusterGrid[gsClusterIndex1D] = sdw::uvec2( gsPointLightStartOffset, gsPointLights.getCount() ); gsSpotLights.getCount() = min( sdw::UInt{ MaxLightsPerCluster }, gsSpotLights.getCount() ); - gsSpotLightStartOffset = sdw::atomicAdd( c3d_spotLightGridListCount, gsSpotLights.getCount() ); - c3d_spotLightGridClusters[gsClusterIndex1D] = sdw::uvec2( gsSpotLightStartOffset, gsSpotLights.getCount() ); + gsSpotLightStartOffset = sdw::atomicAdd( c3d_spotLightClusterListCount, gsSpotLights.getCount() ); + c3d_spotLightClusterGrid[gsClusterIndex1D] = sdw::uvec2( gsSpotLightStartOffset, gsSpotLights.getCount() ); } FI; - 
writer.controlBarrier( sdw::type::Scope::eWorkgroup - , sdw::type::Scope::eWorkgroup - , ( sdw::type::MemorySemanticsMask::eAcquireRelease - | sdw::type::MemorySemanticsMask::eWorkgroupMemory ) ); + shader::groupMemoryBarrierWithGroupSync( writer ); // Now update the global light index lists with the group shared light lists. - IF( writer, groupIndex < gsPointLights.getCount() ) + FOR( writer, sdw::UInt, i, groupIndex, i < gsPointLights.getCount(), i += NumThreads ) { - c3d_pointLightGridIndices[gsPointLightStartOffset + groupIndex] = gsPointLights[groupIndex]; + c3d_pointLightClusterIndex[gsPointLightStartOffset + i] = gsPointLights[i]; } - FI; + ROF; - IF( writer, groupIndex < gsSpotLights.getCount() ) + FOR( writer, sdw::UInt, i, groupIndex, i < gsSpotLights.getCount(), i += NumThreads ) { - c3d_spotLightGridIndices[gsSpotLightStartOffset + groupIndex] = gsSpotLights[groupIndex]; + c3d_spotLightClusterIndex[gsSpotLightStartOffset + i] = gsSpotLights[i]; } - FI; + ROF; } ); return std::make_unique< ast::Shader >( std::move( writer.getShader() ) ); } @@ -276,7 +532,7 @@ namespace castor3d , RenderDevice const & device , FrustumClusters const & clusters , crg::cp::Config config ) - : ShaderHolder{ ShaderModule{ VK_SHADER_STAGE_COMPUTE_BIT, "AssignLightsToClusters", dspclst::createShader() } } + : ShaderHolder{ ShaderModule{ VK_SHADER_STAGE_COMPUTE_BIT, "AssignLightsToClusters", dspclst::createShader( C3D_DebugUseLightsBVH != 0 ) } } , CreateInfoHolder{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, ShaderHolder::getData() ) } } , EnabledHolder{ true } , crg::ComputePass{framePass @@ -322,9 +578,8 @@ namespace castor3d crg::FramePass const & createAssignLightsToClustersPass( crg::FramePassGroup & graph , crg::FramePass const * previousPass , RenderDevice const & device - , LightCache const & lights , CameraUbo const & cameraUbo - , FrustumClusters const & clusters ) + , FrustumClusters & clusters ) { auto & pass = graph.createPass( 
"AssignLightsToClusters" , [&clusters, &device]( crg::FramePass const & framePass @@ -337,22 +592,27 @@ namespace castor3d , device , clusters , crg::cp::Config{} - .groupCountX( clusters.getDimensions()->x ) - .groupCountY( clusters.getDimensions()->y ) - .groupCountZ( clusters.getDimensions()->z ) ); + .groupCountX( clusters.getDimensions()->x * clusters.getDimensions()->y * clusters.getDimensions()->z ) ); device.renderSystem.getEngine()->registerTimer( framePass.getFullName() , result->getTimer() ); return result; } ); pass.addDependency( *previousPass ); cameraUbo.createPassBinding( pass, dspclst::eCamera ); + auto & lights = clusters.getCamera().getScene()->getLightCache(); lights.createPassBinding( pass, dspclst::eLights ); clusters.getClustersUbo().createPassBinding( pass, dspclst::eClusters ); - createInputStoragePassBinding( pass, uint32_t( dspclst::eClustersAABB ), "C3D_ClustersAABB", clusters.getAabbBuffer().getBuffer(), 0u, ashes::WholeSize ); - createClearableOutputStorageBinding( pass, uint32_t( dspclst::ePointLightGridIndex ), "C3D_PointLightGridIndices", clusters.getPointLightIndexBuffer().getBuffer(), 0u, ashes::WholeSize ); - createClearableOutputStorageBinding( pass, uint32_t( dspclst::ePointLightGridCluster ), "C3D_PointLightGridClusters", clusters.getPointLightClusterBuffer().getBuffer(), 0u, ashes::WholeSize ); - createClearableOutputStorageBinding( pass, uint32_t( dspclst::eSpotLightGridIndex ), "C3D_SpotLightGridIndices", clusters.getSpotLightIndexBuffer().getBuffer(), 0u, ashes::WholeSize ); - createClearableOutputStorageBinding( pass, uint32_t( dspclst::eSpotLightGridCluster ), "C3D_LSpotightGridClusters", clusters.getSpotLightClusterBuffer().getBuffer(), 0u, ashes::WholeSize ); + createInputStoragePassBinding( pass, uint32_t( dspclst::eClustersAABB ), "C3D_ClustersAABB", clusters.getClustersAABBBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( dspclst::ePointLightIndex ), 
"C3D_PointLightClusterIndex", clusters.getPointLightClusterIndexBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( dspclst::ePointLightCluster ), "C3D_PointLightClusterGrid", clusters.getPointLightClusterGridBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( dspclst::eSpotLightIndex ), "C3D_SpotLightClusterIndex", clusters.getSpotLightClusterIndexBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( dspclst::eSpotLightCluster ), "C3D_SpotLightClusterGrid", clusters.getSpotLightClusterGridBuffer( ), 0u, ashes::WholeSize ); +#if C3D_DebugUseLightsBVH + createInputStoragePassBinding( pass, uint32_t( dspclst::ePointLightBVH ), "C3D_PointLightsBVH", clusters.getPointLightBVHBuffer(), 0u, ashes::WholeSize ); + createInputStoragePassBinding( pass, uint32_t( dspclst::eSpotLightBVH ), "C3D_SpotLightsBVH", clusters.getSpotLightBVHBuffer(), 0u, ashes::WholeSize ); + createInputStoragePassBinding( pass, uint32_t( dspclst::ePointLightIndices ), "C3D_PointLightIndices", clusters.getInputPointLightIndicesBuffer(), 0u, ashes::WholeSize ); + createInputStoragePassBinding( pass, uint32_t( dspclst::eSpotLightIndices ), "C3D_SpotLightIndices", clusters.getInputSpotLightIndicesBuffer(), 0u, ashes::WholeSize ); +#endif return pass; } diff --git a/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp b/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp new file mode 100644 index 0000000000..4c03577a58 --- /dev/null +++ b/source/Core/Castor3D/Render/Clustered/BuildLightsBVH.cpp @@ -0,0 +1,527 @@ +#include "Castor3D/Render/Clustered/BuildLightsBVH.hpp" + +#include "Castor3D/Engine.hpp" +#include "Castor3D/Cache/LightCache.hpp" +#include "Castor3D/Render/RenderDevice.hpp" +#include "Castor3D/Render/RenderSystem.hpp" +#include "Castor3D/Render/Clustered/FrustumClusters.hpp" +#include "Castor3D/Scene/Camera.hpp" +#include "Castor3D/Scene/Scene.hpp" +#include 
"Castor3D/Scene/Light/PointLight.hpp" +#include "Castor3D/Scene/Light/SpotLight.hpp" +#include "Castor3D/Shader/Program.hpp" +#include "Castor3D/Shader/Shaders/GlslAABB.hpp" +#include "Castor3D/Shader/Shaders/GlslAppendBuffer.hpp" +#include "Castor3D/Shader/Shaders/GlslClusteredLights.hpp" +#include "Castor3D/Shader/Shaders/GlslLight.hpp" +#include "Castor3D/Shader/Ubos/CameraUbo.hpp" +#include "Castor3D/Shader/Ubos/ClustersUbo.hpp" + +#include + +#include + +#include +#include + +namespace castor3d +{ + //********************************************************************************************* + + namespace lgtbvh + { + enum BindingPoints + { + eCamera, + eLights, + eClusters, + ePointLightIndices, + eSpotLightIndices, + ePointLightBVH, + eSpotLightBVH, + }; + + static uint32_t constexpr NumThreads = 32u * 16u; + + static ShaderPtr createShader( bool bottomLevel ) + { + sdw::ComputeWriter writer; + + auto c3d_numLevelNodes = writer.declConstantArray< sdw::UInt >( "c3d_numLevelNodes" + , { 1_u /* Level 0 ( 32^0 ) */ + , 32_u /* Level 1 ( 32^1 ) */ + , 1024_u /* Level 2 ( 32^2 ) */ + , 32768_u /* Level 3 ( 32^3 ) */ + , 1048576_u /* Level 4 ( 32^4 ) */ + , 33554432_u /* Level 5 ( 32^5 ) */ + , 1073741824_u /* Level 6 ( 32^6 ) */ } ); + auto c3d_firstNodeIndex = writer.declConstantArray< sdw::UInt >( "c3d_firstNodeIndex" + , { 0_u /* Level 0 */ + , 1_u /* Level 1 */ + , 33_u /* Level 2 */ + , 1057_u /* Level 3 */ + , 33825_u /* Level 4 */ + , 1082401_u /* Level 5 */ + , 34636833_u /* Level 6 */ } ); + + // Inputs + C3D_CameraEx( writer + , eCamera + , 0u + , bottomLevel ); + shader::LightsBuffer lights{ writer + , eLights + , 0u + , bottomLevel }; + C3D_Clusters( writer + , eClusters + , 0u ); + C3D_PointLightIndicesEx( writer + , ePointLightIndices + , 0u + , bottomLevel ); + C3D_SpotLightIndicesEx( writer + , eSpotLightIndices + , 0u + , bottomLevel ); + C3D_PointLightBVH( writer + , ePointLightBVH + , 0u ); + C3D_SpotLightBVH( writer + , eSpotLightBVH + , 0u 
); + + sdw::PushConstantBuffer pcb{ writer + , "C3D_DrawData" + , "c3d_drawData" + , sdw::type::MemoryLayout::eC + , !bottomLevel }; + auto c3d_childLevel = pcb.declMember< sdw::UInt >( "childLevel", !bottomLevel ); + pcb.end(); + + auto gsAABBMin = writer.declSharedVariable< sdw::Vec4 >( "gsAABBMin", NumThreads ); + auto gsAABBMax = writer.declSharedVariable< sdw::Vec4 >( "gsAABBMax", NumThreads ); + + auto logStepReduction = writer.implementFunction< sdw::Void >( "logStepReduction" + , [&]( sdw::UInt groupIndex ) + { + auto reduceIndex = writer.declLocale( "reduceIndex" + , 32_u >> 1_u ); + auto mod32GroupIndex = writer.declLocale( "mod32GroupIndex" + , groupIndex % 32_u ); + + WHILE( writer, mod32GroupIndex < reduceIndex ) + { + gsAABBMin[groupIndex] = min( gsAABBMin[groupIndex], gsAABBMin[groupIndex + reduceIndex] ); + gsAABBMax[groupIndex] = max( gsAABBMax[groupIndex], gsAABBMax[groupIndex + reduceIndex] ); + + reduceIndex >>= 1_u; + } + ELIHW; + } + , sdw::InUInt{ writer, "groupIndex" } ); + + writer.implementMainT< sdw::VoidT >( NumThreads + , [&]( sdw::ComputeIn in ) + { + auto aabbMin = writer.declLocale< sdw::Vec4 >( "aabbMin" ); + auto aabbMax = writer.declLocale< sdw::Vec4 >( "aabbMax" ); + auto groupIndex = in.localInvocationIndex; + auto threadIndex = in.globalInvocationID.x(); + + if ( bottomLevel ) + { + // First compute BVH AABB for point lights. 
+ auto leafIndex = writer.declLocale( "leafIndex" + , threadIndex ); + + IF( writer, leafIndex < c3d_clustersData.pointLightCount() ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , c3d_pointLightIndices[leafIndex] ); + auto point = writer.declLocale( "point" + , lights.getPointLight( lightOffset ) ); + auto vsPosition = writer.declLocale( "vsPosition" + , c3d_cameraData.worldToCurView( vec4( point.position(), 1.0_f ) ) ); + + aabbMin = vsPosition - point.base().farPlane(); + aabbMax = vsPosition + point.base().farPlane(); + } + ELSE + { + aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f ); + aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f ); + } + FI; + + gsAABBMin[groupIndex] = aabbMin; + gsAABBMax[groupIndex] = aabbMax; + + // Log-step reduction is performed warp-syncronous and thus does not require + // a group sync barrier. + logStepReduction( groupIndex ); + + // The first thread of each warp will write the AABB to global memory. + IF( writer, threadIndex % 32_u == 0_u ) + { + // Number of levels in the BVH + auto numLevels = writer.declLocale( "numLevels" + , c3d_clustersData.pointLightLevels() ); + // Offset of the node in the BVH at the last level of child nodes. + auto nodeOffset = writer.declLocale( "nodeOffset" + , threadIndex / 32_u ); + + IF( writer, numLevels > 0_u && nodeOffset < c3d_numLevelNodes[numLevels - 1_u] ) + { + auto nodeIndex = writer.declLocale( "nodeIndex" + , c3d_firstNodeIndex[numLevels - 1_u] + nodeOffset ); + c3d_pointLightBVH[nodeIndex].min() = gsAABBMin[groupIndex]; + c3d_pointLightBVH[nodeIndex].max() = gsAABBMax[groupIndex]; + } + FI; + } + FI; + + // Now compute BVH nodes for spot lights. 
+ IF( writer, leafIndex < c3d_clustersData.spotLightCount() ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , c3d_spotLightIndices[leafIndex] ); + auto spot = writer.declLocale( "spot" + , lights.getSpotLight( lightOffset ) ); + auto vsPosition = writer.declLocale( "vsPosition" + , c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ) ); + + aabbMin = vsPosition - spot.base().farPlane(); + aabbMax = vsPosition + spot.base().farPlane(); + } + ELSE + { + aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f ); + aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f ); + } + FI; + + gsAABBMin[groupIndex] = aabbMin; + gsAABBMax[groupIndex] = aabbMax; + + logStepReduction( groupIndex ); + + // The first thread of each warp will write the AABB to global memory. + IF( writer, threadIndex % 32_u == 0_u ) + { + // Number of levels in the BVH + auto numLevels = writer.declLocale( "numLevels" + , c3d_clustersData.spotLightLevels() ); + // Offset of the node in the BVH at the last level of child nodes. + auto nodeOffset = writer.declLocale( "nodeOffset" + , threadIndex / 32_u ); + + IF( writer, numLevels > 0_u && nodeOffset < c3d_numLevelNodes[numLevels - 1_u] ) + { + auto nodeIndex = writer.declLocale( "nodeIndex" + , c3d_firstNodeIndex[numLevels - 1_u] + nodeOffset ); + + c3d_spotLightBVH[nodeIndex].min() = gsAABBMin[groupIndex]; + c3d_spotLightBVH[nodeIndex].max() = gsAABBMax[groupIndex]; + } + FI; + } + FI; + } + else + { + // First build upper BVH for point light BVH. 
+ auto numLevels = writer.declLocale( "numLevels" + , c3d_clustersData.pointLightLevels() ); + auto childOffset = writer.declLocale( "childOffset" + , threadIndex ); + + IF( writer, c3d_childLevel < numLevels && childOffset < c3d_numLevelNodes[c3d_childLevel] ) + { + auto childIndex = writer.declLocale( "childIndex" + , c3d_firstNodeIndex[c3d_childLevel] + childOffset ); + + aabbMin = c3d_pointLightBVH[childIndex].min(); + aabbMax = c3d_pointLightBVH[childIndex].max(); + } + ELSE + { + aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f ); + aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f ); + } + FI; + + gsAABBMin[groupIndex] = aabbMin; + gsAABBMax[groupIndex] = aabbMax; + + // Log-step reduction is performed warp-syncronous and thus does not require + // a group sync barrier. + logStepReduction( groupIndex ); + + // The first thread of each warp will write the AABB to global memory. + IF( writer, threadIndex % 32_u == 0_u ) + { + auto nodeOffset = writer.declLocale( "nodeOffset" + , threadIndex / 32_u ); + + IF( writer, c3d_childLevel < numLevels && nodeOffset < c3d_numLevelNodes[c3d_childLevel - 1_u] ) + { + auto nodeIndex = writer.declLocale( "nodeIndex" + , c3d_firstNodeIndex[c3d_childLevel - 1_u] + nodeOffset ); + c3d_pointLightBVH[nodeIndex].min() = gsAABBMin[groupIndex]; + c3d_pointLightBVH[nodeIndex].max() = gsAABBMax[groupIndex]; + } + FI; + } + FI; + + // Now build upper BVH for spot light BVH. 
+						numLevels = c3d_clustersData.spotLightLevels();
+
+						IF( writer, c3d_childLevel < numLevels && childOffset < c3d_numLevelNodes[c3d_childLevel] )
+						{
+							auto childIndex = writer.declLocale( "childIndex"
+								, c3d_firstNodeIndex[c3d_childLevel] + childOffset );
+
+							aabbMin = c3d_spotLightBVH[childIndex].min();
+							aabbMax = c3d_spotLightBVH[childIndex].max();
+						}
+						ELSE
+						{
+							aabbMin = vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f );
+							aabbMax = vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f );
+						}
+						FI;
+
+						gsAABBMin[groupIndex] = aabbMin;
+						gsAABBMax[groupIndex] = aabbMax;
+
+						// Log-step reduction is performed warp-synchronous and thus does not require
+						// a group sync barrier.
+						logStepReduction( groupIndex );
+
+						// The first thread of each warp will write the AABB to global memory.
+						IF( writer, threadIndex % 32_u == 0_u )
+						{
+							auto nodeOffset = writer.declLocale( "nodeOffset"
+								, threadIndex / 32_u );
+
+							IF( writer, c3d_childLevel < numLevels && nodeOffset < c3d_numLevelNodes[c3d_childLevel - 1_u] )
+							{
+								auto nodeIndex = writer.declLocale( "nodeIndex"
+									, c3d_firstNodeIndex[c3d_childLevel - 1_u] + nodeOffset );
+								c3d_spotLightBVH[nodeIndex].min() = gsAABBMin[groupIndex];
+								c3d_spotLightBVH[nodeIndex].max() = gsAABBMax[groupIndex];
+							}
+							FI;
+						}
+						FI;
+					}
+				} );
+			return std::make_unique< ast::Shader >( std::move( writer.getShader() ) );
+		}
+
+		// Runnable pass recording the lights BVH construction: one dispatch of the
+		// bottom-level pipeline, then one dispatch of the top-level pipeline per upper level.
+		class FramePass
+			: public crg::RunnablePass
+		{
+		public:
+			// Builds both compute pipelines (bottom level and top level) from the same
+			// frame pass, and keeps a reference to the light cache of the clusters' scene.
+			FramePass( crg::FramePass const & framePass
+				, crg::GraphContext & context
+				, crg::RunnableGraph & graph
+				, RenderDevice const & device
+				, FrustumClusters const & clusters )
+				: crg::RunnablePass{ framePass
+					, context
+					, graph
+					, { [this]( uint32_t index ){ doInitialise( index ); }
+						, GetPipelineStateCallback( [](){ return crg::getPipelineState( VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); } )
+						, [this]( crg::RecordContext & recContext, VkCommandBuffer cb, uint32_t i ){ doRecordInto( recContext, cb, i ); }
+						, GetPassIndexCallback( [](){ return 0u; } )
+						, IsEnabledCallback( [this](){ return doIsEnabled(); } )
+						, IsComputePassCallback( [](){ return true; } ) }
+					, crg::ru::Config{ 1u, true /* resettable */ } }
+				, m_lightCache{ clusters.getCamera().getScene()->getLightCache() }
+				, m_bottom{ framePass, context, graph, device, true, clusters }
+				, m_top{ framePass, context, graph, device, false, clusters }
+			{
+			}
+
+			// Resets the command buffer, re-creates both pipelines from their stored
+			// shader stages, then re-records. The `config` argument is not used here:
+			// each pipeline is reset with its own `createInfo`.
+			CRG_API void resetPipeline( crg::VkPipelineShaderStageCreateInfoArray config
+				, uint32_t index )
+			{
+				resetCommandBuffer( index );
+				m_bottom.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_bottom.createInfo ), index );
+				m_top.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_top.createInfo ), index );
+				doCreatePipeline( index, m_bottom );
+				doCreatePipeline( index, m_top );
+				reRecordCurrent();
+			}
+
+		private:
+			// Shader module, stage create info, compute config and pipeline holder
+			// for one of the two BVH build steps.
+			struct Pipeline
+			{
+				ShaderModule shader;
+				ashes::PipelineShaderStageCreateInfoArray createInfo;
+				crg::cp::ConfigData cpConfig;
+				crg::PipelineHolder pipeline;
+
+				// `bottomLevel` selects which variant of the shader createShader() generates.
+				// The pipeline layout declares a single 4-byte compute push constant range.
+				Pipeline( crg::FramePass const & framePass
+					, crg::GraphContext & context
+					, crg::RunnableGraph & graph
+					, RenderDevice const & device
+					, bool bottomLevel
+					, FrustumClusters const & clusters )
+					: shader{ VK_SHADER_STAGE_COMPUTE_BIT, "BuildLightsBVH", createShader( bottomLevel ) }
+					, createInfo{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, shader ) } }
+					, cpConfig{ crg::defaultV< uint32_t const * >
+						, &clusters.needsLightsUpdate()
+						, crg::getDefaultV< IsEnabledCallback >()
+						, crg::getDefaultV< RecordCallback >()
+						, crg::getDefaultV< RecordCallback >()
+						, 1u
+						, 1u
+						, 1u }
+					, pipeline{ framePass
+						, context
+						, graph
+						, crg::pp::Config{}
+							.program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( createInfo ) )
+							.pushConstants( VkPushConstantRange{ VK_SHADER_STAGE_COMPUTE_BIT, 0u, 4u } )
+						, VK_PIPELINE_BIND_POINT_COMPUTE
+						, 1u }
+				{
+				}
+			};
+
+		private:
+			LightCache const & m_lightCache;
+			Pipeline m_bottom;
+			Pipeline m_top;
+
+		private:
+			// Initialises both pipeline holders, then creates their Vulkan pipelines.
+			void doInitialise( uint32_t index )
+			{
+				m_bottom.pipeline.initialise();
+				m_top.pipeline.initialise();
+				doCreatePipeline( index, m_bottom );
+				doCreatePipeline( index, m_top );
+			}
+
+			// The pass is enabled when the `isEnabled` callback of either pipeline config says so.
+			// NOTE(review): only `cpConfig.isEnabled` is consulted here; whether the
+			// `&clusters.needsLightsUpdate()` flag given to cpConfig is honoured anywhere
+			// cannot be told from this file - confirm against crg::cp::ConfigData.
+			bool doIsEnabled()const
+			{
+				return ( m_bottom.cpConfig.isEnabled ? ( *m_bottom.cpConfig.isEnabled )() : false )
+					|| ( m_top.cpConfig.isEnabled ? ( *m_top.cpConfig.isEnabled )() : false );
+			}
+
+			// Records the whole BVH build into `commandBuffer`.
+			void doRecordInto( crg::RecordContext & context
+				, VkCommandBuffer commandBuffer
+				, uint32_t index )
+			{
+				// Build bottom level of the BVH.
+				// The dispatch is sized on the larger of the point and spot light counts,
+				// with 32 * 16 leaves handled per thread group.
+				auto pointLightsCount = m_lightCache.getLightsCount( LightType::ePoint );
+				auto spoLightsCount = m_lightCache.getLightsCount( LightType::eSpot );
+				auto maxLeaves = std::max( pointLightsCount, spoLightsCount );
+				auto numThreadGroups = uint32_t( std::ceil( float( maxLeaves ) / float( 32u * 16u ) ) );
+				m_bottom.pipeline.recordInto( context, commandBuffer, index );
+				m_context.vkCmdDispatch( commandBuffer, numThreadGroups, 1u, 1u );
+				uint32_t maxLevels = FrustumClusters::getNumLevels( maxLeaves );
+				doBarriers( context, commandBuffer, 0 );
+
+				// Now build upper levels of the BVH.
+				if ( maxLevels > 1u )
+				{
+					m_top.pipeline.recordInto( context, commandBuffer, index );
+
+					// Walk from the deepest child level up to level 1; the level index is
+					// sent as the 4-byte push constant before each dispatch.
+					for ( uint32_t level = maxLevels - 1u; level > 0; --level )
+					{
+						doBarriers( context, commandBuffer, 1 );
+						m_context.vkCmdPushConstants( commandBuffer, m_top.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 4u, &level );
+						uint32_t numChildNodes = FrustumClusters::getNumLevelNodes( level );
+						numThreadGroups = uint32_t( std::ceil( float( numChildNodes ) / float( NumThreads ) ) );
+						m_context.vkCmdDispatch( commandBuffer, numThreadGroups, 1u, 1u );
+					}
+				}
+
+				doBarriers( context, commandBuffer, 2 );
+			}
+
+			// Inserts a memory barrier on every storage, clearable buffer attachment of the
+			// pass that allows transitions. With `idx == 2` (the final call) the target
+			// state is shader read only; otherwise it is shader read + write.
+			void doBarriers( crg::RecordContext & context
+				, VkCommandBuffer commandBuffer
+				, int idx )
+			{
+				for ( auto & attach : m_pass.buffers )
+				{
+					auto buffer = attach.buffer;
+
+					if ( !attach.isNoTransition()
+						&& attach.isStorageBuffer()
+						&& attach.isClearableBuffer() )
+					{
+						auto currentState = context.getAccessState( buffer.buffer.buffer
+							, buffer.range );
+						context.memoryBarrier( commandBuffer
+							, buffer.buffer.buffer
+							, buffer.range
+							, currentState.access
+							, currentState.pipelineStage
+							, ( ( idx == 2 )
+								? crg::AccessState{ VK_ACCESS_SHADER_READ_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT }
+								: crg::AccessState{ VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT } )
+							, true );
+					}
+				}
+			}
+
+			// Creates the Vulkan compute pipeline of `pipeline` from the first stage of
+			// its program and its pipeline layout.
+			void doCreatePipeline( uint32_t index
+				, Pipeline & pipeline )
+			{
+				auto & program = pipeline.pipeline.getProgram( index );
+				VkComputePipelineCreateInfo createInfo{ VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO
+					, nullptr
+					, 0u
+					, program.front()
+					, pipeline.pipeline.getPipelineLayout()
+					, VkPipeline{}
+					, 0u };
+				pipeline.pipeline.createPipeline( index, createInfo );
+			}
+		};
+	}
+
+	//*********************************************************************************************
+
+	// Creates the "BuildLightsBVH" frame pass in `graph`, makes it depend on
+	// `previousPass` (which is dereferenced, so it must not be null) and declares its
+	// bindings: camera, lights, clusters UBO, the two light indices buffers as inputs
+	// and the two BVH buffers as clearable outputs.
+	crg::FramePass const & createBuildLightsBVHPass( crg::FramePassGroup & graph
+		, crg::FramePass const * previousPass
+		, RenderDevice const & device
+		, CameraUbo const & cameraUbo
+		, FrustumClusters & clusters )
+	{
+		auto & pass = graph.createPass( "BuildLightsBVH"
+			, [&clusters, &device]( crg::FramePass const & framePass
+				, crg::GraphContext & context
+				, crg::RunnableGraph & graph )
+			{
+				auto result = std::make_unique< lgtbvh::FramePass >( framePass
+					, context
+					, graph
+					, device
+					, clusters );
+				device.renderSystem.getEngine()->registerTimer( framePass.getFullName()
+					, result->getTimer() );
+				return result;
+			} );
+		pass.addDependency( *previousPass );
+		cameraUbo.createPassBinding( pass, lgtbvh::eCamera );
+		auto & lights = clusters.getCamera().getScene()->getLightCache();
+		lights.createPassBinding( pass, lgtbvh::eLights );
+		clusters.getClustersUbo().createPassBinding( pass, lgtbvh::eClusters );
+		createInputStoragePassBinding( pass, uint32_t( lgtbvh::ePointLightIndices ), "C3D_PointLightIndices", clusters.getInputPointLightIndicesBuffer(), 0u, ashes::WholeSize );
+		createInputStoragePassBinding( pass, uint32_t( lgtbvh::eSpotLightIndices ), "C3D_SpotLightIndices", clusters.getInputSpotLightIndicesBuffer(), 0u, ashes::WholeSize );
+		createClearableOutputStorageBinding( pass, uint32_t( lgtbvh::ePointLightBVH ), "C3D_PointLightBVH", clusters.getPointLightBVHBuffer(), 0u, ashes::WholeSize );
+		createClearableOutputStorageBinding( pass, uint32_t( lgtbvh::eSpotLightBVH ), "C3D_SpotLightBVH", clusters.getSpotLightBVHBuffer(), 0u, ashes::WholeSize );
+		return pass;
+	}
+
+	//*********************************************************************************************
+}
diff --git a/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp b/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp
index c48d5d3b74..afc2e08506 100644
--- a/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp
+++ b/source/Core/Castor3D/Render/Clustered/ComputeClustersAABB.cpp
@@ -96,8 +96,7 @@ namespace castor3d
 			writer.implementMainT< VoidT >( 1u, 1u, 1u
 				, [&]( ComputeIn in )
 				{
-					auto clusterIndex3D = writer.declLocale( "clusterIndex3D"
-						, in.globalInvocationID );
+					auto clusterIndex3D = in.globalInvocationID;
 					auto clusterIndex1D = writer.declLocale( "clusterIndex1D"
 						, c3d_clustersData.computeClusterIndex1D( clusterIndex3D ) );
 
@@ -207,7 +206,7 @@ namespace castor3d
 		, crg::FramePass const * previousPass
 		, RenderDevice const & device
 		, CameraUbo const & cameraUbo
-		, FrustumClusters const & clusters )
+		, FrustumClusters & clusters )
 	{
 		auto & pass = graph.createPass( "ComputeClustersAABB"
 			, [&clusters, &device]( crg::FramePass const & framePass
@@ -230,7 +229,7 @@ namespace castor3d
 		pass.addDependency( *previousPass );
 		cameraUbo.createPassBinding( pass, cptclsb::eCamera );
 		clusters.getClustersUbo().createPassBinding( pass, cptclsb::eClusters );
-		createClearableOutputStorageBinding( pass, uint32_t( cptclsb::eClustersAABB ), "C3D_ClustersAABB", clusters.getAabbBuffer().getBuffer(), 0u, ashes::WholeSize );
+		createClearableOutputStorageBinding( pass, uint32_t( cptclsb::eClustersAABB ), "C3D_ClustersAABB", clusters.getClustersAABBBuffer(), 0u, ashes::WholeSize );
 		return pass;
 	}
 
diff --git 
a/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp b/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp new file mode 100644 index 0000000000..46e83a1e9b --- /dev/null +++ b/source/Core/Castor3D/Render/Clustered/ComputeLightsMortonCode.cpp @@ -0,0 +1,263 @@ +#include "Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp" + +#include "Castor3D/Engine.hpp" +#include "Castor3D/Cache/LightCache.hpp" +#include "Castor3D/Render/RenderDevice.hpp" +#include "Castor3D/Render/RenderSystem.hpp" +#include "Castor3D/Render/Clustered/FrustumClusters.hpp" +#include "Castor3D/Scene/Camera.hpp" +#include "Castor3D/Scene/Scene.hpp" +#include "Castor3D/Scene/Light/PointLight.hpp" +#include "Castor3D/Scene/Light/SpotLight.hpp" +#include "Castor3D/Shader/Program.hpp" +#include "Castor3D/Shader/Shaders/GlslAABB.hpp" +#include "Castor3D/Shader/Shaders/GlslAppendBuffer.hpp" +#include "Castor3D/Shader/Shaders/GlslClusteredLights.hpp" +#include "Castor3D/Shader/Shaders/GlslLight.hpp" +#include "Castor3D/Shader/Shaders/GlslUtils.hpp" +#include "Castor3D/Shader/Ubos/CameraUbo.hpp" +#include "Castor3D/Shader/Ubos/ClustersUbo.hpp" + +#include + +#include + +#include +#include + +namespace castor3d +{ + //********************************************************************************************* + + namespace cmpmrt + { + enum BindingPoints + { + eCamera, + eLights, + eClusters, + eLightsAABB, + ePointLightMortonCodes, + eSpotLightMortonCodes, + ePointLightIndices, + eSpotLightIndices, + }; + + static uint32_t constexpr NumThreads = 1024u; + // Generate 3k-bit morton codes. + // For k = 10, morton codes will be 30-bits. + static u32 constexpr kBitMortonCode = 10u; + // To quantize the light's position, the light's view space position will + // be normalized based on the AABB that encompases all lights. 
+ // The normalized value will then be scaled based on the k-bits of the morton + // code and the resulting bits of the x, y, and z components will be interleved + // to produce the final morton code. + static float constexpr coordinateScale = float( ( 1u << kBitMortonCode ) - 1u ); // This is equivalent to 2^k-1 which results in a value that when scaled by 1 will produce a number that is exactly k bits. + + static ShaderPtr createShader() + { + sdw::ComputeWriter writer; + + // Inputs + C3D_Camera( writer + , eCamera + , 0u ); + shader::LightsBuffer lights{ writer + , eLights + , 0u }; + C3D_Clusters( writer + , eClusters + , 0u ); + C3D_LightsAABB( writer + , eLightsAABB + , 0u ); + C3D_PointLightMortonCodes( writer + , ePointLightMortonCodes + , 0u ); + C3D_SpotLightMortonCodes( writer + , eSpotLightMortonCodes + , 0u ); + C3D_PointLightIndices( writer + , ePointLightIndices + , 0u ); + C3D_SpotLightIndices( writer + , eSpotLightIndices + , 0u ); + + auto gsAABB = writer.declSharedVariable< shader::AABB >( "gsAABB" ); + auto gsAABBRange = writer.declSharedVariable< sdw::Vec4 >( "gsAABBRange" ); + + // Produce a 3k-bit morton code from a quantized coordinate. + auto getMortonCode = writer.implementFunction< sdw::UInt >( "getMortonCode" + , [&]( sdw::UVec3 quantizedCoord ) + { + auto mortonCode = writer.declLocale( "mortonCode", 0_u ); + auto bitMask = 1u; + auto bitShift = 0u; + auto kBits = 1u << kBitMortonCode; + + while ( bitMask < kBits ) + { + // Interleave the bits of the X, Y, and Z coordinates to produce the final Morton code. 
+ mortonCode |= ( quantizedCoord.x() & bitMask ) << ( bitShift + 0_u ); + mortonCode |= ( quantizedCoord.y() & bitMask ) << ( bitShift + 1_u ); + mortonCode |= ( quantizedCoord.z() & bitMask ) << ( bitShift + 2_u ); + + bitMask <<= 1u; + bitShift += 2u; + } + + return mortonCode; + } + , sdw::InUVec3{ writer, "quantizedCoord" } ); + + writer.implementMainT< sdw::VoidT >( NumThreads + , [&]( sdw::ComputeIn in ) + { + auto groupIndex = in.localInvocationIndex; + + IF( writer, groupIndex == 0_u ) + { + gsAABB = c3d_lightsAABB[0_u]; + // Compute the recipocal of the range of the AABB. + // This is used to normalize the light coordinates within the bounds of the AABB. + gsAABBRange = vec4( 1.0_f ) / ( gsAABB.max() - gsAABB.min() ); + } + FI; + + shader::groupMemoryBarrierWithGroupSync( writer ); + auto coordScale = vec4( sdw::Float{ coordinateScale } ); + + auto threadIndex = in.globalInvocationID.x(); + + IF( writer, threadIndex < c3d_clustersData.pointLightCount() ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , lights.getDirectionalsEnd() + threadIndex * PointLight::LightDataComponents ); + auto point = writer.declLocale( "point" + , lights.getPointLight( lightOffset ) ); + auto vsPosition = writer.declLocale( "vsPosition" + , c3d_cameraData.worldToCurView( vec4( point.position(), 1.0_f ) ) ); + // Normalize and scale the position of the light to produce the quantized coordinate. 
+ auto quantized = writer.declLocale( "quantized" + , sdw::uvec4( ( vsPosition - gsAABB.min() ) * gsAABBRange * coordScale ) ); + + c3d_pointLightMortonCodes[threadIndex] = getMortonCode( quantized.xyz() ); + c3d_pointLightIndices[threadIndex] = lightOffset; + } + FI; + + IF( writer, threadIndex < c3d_clustersData.spotLightCount() ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , lights.getPointsEnd() + threadIndex * SpotLight::LightDataComponents ); + auto spot = writer.declLocale( "spot" + , lights.getSpotLight( lightOffset ) ); + auto vsPosition = writer.declLocale( "vsPosition" + , c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ) ); + // Normalize and scale the position of the light to produce the quantized coordinate. + auto quantized = writer.declLocale( "quantized" + , sdw::uvec4( ( vsPosition - gsAABB.min() ) * gsAABBRange * coordScale ) ); + + c3d_spotLightMortonCodes[threadIndex] = getMortonCode( quantized.xyz() ); + c3d_spotLightIndices[threadIndex] = lightOffset; + } + FI; + } ); + return std::make_unique< ast::Shader >( std::move( writer.getShader() ) ); + } + + class FramePass + : private castor::DataHolderT< ShaderModule > + , private castor::DataHolderT< ashes::PipelineShaderStageCreateInfoArray > + , public crg::ComputePass + { + using ShaderHolder = DataHolderT< ShaderModule >; + using CreateInfoHolder = DataHolderT< ashes::PipelineShaderStageCreateInfoArray >; + + public: + FramePass( crg::FramePass const & framePass + , crg::GraphContext & context + , crg::RunnableGraph & graph + , RenderDevice const & device + , crg::cp::Config config ) + : ShaderHolder{ ShaderModule{ VK_SHADER_STAGE_COMPUTE_BIT, "ComputeLightsMortonCode", createShader() } } + , CreateInfoHolder{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, ShaderHolder::getData() ) } } + , crg::ComputePass{framePass + , context + , graph + , crg::ru::Config{} + , config + .program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( 
CreateInfoHolder::getData() ) ) + .end( RecordCallback{ [this]( crg::RecordContext & ctx, VkCommandBuffer cb, uint32_t idx ) { doPostRecord( ctx, cb, idx ); } } ) } + { + } + + private: + void doPostRecord( crg::RecordContext & context + , VkCommandBuffer commandBuffer + , uint32_t index ) + { + for ( auto & attach : m_pass.buffers ) + { + auto buffer = attach.buffer; + + if ( !attach.isNoTransition() + && attach.isStorageBuffer() + && attach.isClearableBuffer() ) + { + auto currentState = context.getAccessState( buffer.buffer.buffer + , buffer.range ); + context.memoryBarrier( commandBuffer + , buffer.buffer.buffer + , buffer.range + , currentState.access + , currentState.pipelineStage + , { VK_ACCESS_SHADER_READ_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT } ); + } + } + } + }; + } + + //********************************************************************************************* + + crg::FramePass const & createComputeLightsMortonCodePass( crg::FramePassGroup & graph + , crg::FramePass const * previousPass + , RenderDevice const & device + , CameraUbo const & cameraUbo + , FrustumClusters & clusters ) + { + auto & pass = graph.createPass( "ComputeLightsMortonCode" + , [&clusters, &device]( crg::FramePass const & framePass + , crg::GraphContext & context + , crg::RunnableGraph & graph ) + { + uint32_t numThreadGroups = uint32_t( std::ceil( float( MaxLightsCount ) / 1024.0f ) ); + auto result = std::make_unique< cmpmrt::FramePass >( framePass + , context + , graph + , device + , crg::cp::Config{} + .groupCountX( numThreadGroups ) + .enabled( &clusters.needsClustersUpdate() ) ); + device.renderSystem.getEngine()->registerTimer( framePass.getFullName() + , result->getTimer() ); + return result; + } ); + pass.addDependency( *previousPass ); + cameraUbo.createPassBinding( pass, cmpmrt::eCamera ); + auto & lights = clusters.getCamera().getScene()->getLightCache(); + lights.createPassBinding( pass, cmpmrt::eLights ); + clusters.getClustersUbo().createPassBinding( pass, 
cmpmrt::eClusters ); + createInputStoragePassBinding( pass, uint32_t( cmpmrt::eLightsAABB ), "C3D_LightsAABB", clusters.getLightsAABBBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::ePointLightMortonCodes ), "C3D_PointLightMortonCodes", clusters.getOutputPointLightMortonCodesBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::eSpotLightMortonCodes ), "C3D_SpotLightMortonCodes", clusters.getOutputSpotLightMortonCodesBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::ePointLightIndices ), "C3D_PointLightIndices", clusters.getOutputPointLightIndicesBuffer(), 0u, ashes::WholeSize ); + createClearableOutputStorageBinding( pass, uint32_t( cmpmrt::eSpotLightIndices ), "C3D_SpotLightIndices", clusters.getOutputSpotLightIndicesBuffer(), 0u, ashes::WholeSize ); + clusters.swapLightMortonIndicesIO(); + return pass; + } + + //********************************************************************************************* +} diff --git a/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp b/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp index 232a553e3b..b0785a357a 100644 --- a/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp +++ b/source/Core/Castor3D/Render/Clustered/FrustumClusters.cpp @@ -1,9 +1,15 @@ #include "Castor3D/Render/Clustered/FrustumClusters.hpp" +#include "Castor3D/DebugDefines.hpp" #include "Castor3D/Engine.hpp" #include "Castor3D/Buffer/GpuBufferPool.hpp" #include "Castor3D/Cache/LightCache.hpp" #include "Castor3D/Render/RenderDevice.hpp" +#include "Castor3D/Render/Clustered/AssignLightsToClusters.hpp" +#include "Castor3D/Render/Clustered/BuildLightsBVH.hpp" +#include "Castor3D/Render/Clustered/ComputeClustersAABB.hpp" +#include "Castor3D/Render/Clustered/ComputeLightsMortonCode.hpp" +#include "Castor3D/Render/Clustered/ReduceLightsAABB.hpp" #include "Castor3D/Scene/Camera.hpp" #include 
"Castor3D/Scene/Scene.hpp" #include "Castor3D/Scene/Light/DirectionalLight.hpp" @@ -11,10 +17,39 @@ #include "Castor3D/Scene/Light/PointLight.hpp" #include "Castor3D/Scene/Light/SpotLight.hpp" +#include + CU_ImplementSmartPtr( castor3d, FrustumClusters ) namespace castor3d { + //********************************************************************************************* + + namespace frscls + { + template< typename DataT > + void updateBuffer( RenderDevice const & device + , VkDeviceSize elementCount + , std::string const & debugName + , ashes::BufferBasePtr & buffer + , std::vector< ashes::BufferBasePtr > & toDelete ) + { + if ( buffer ) + { + toDelete.emplace_back( std::move( buffer ) ); + } + + buffer = makeBufferBase( device + , elementCount * sizeof( DataT ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_" + debugName ); + } + + } + + //********************************************************************************************* + FrustumClusters::FrustumClusters( RenderDevice const & device , Camera const & camera ) : m_device{ device } @@ -25,6 +60,63 @@ namespace castor3d , m_cameraView{ m_clustersDirty, {} } , m_nearK{ m_clustersDirty, 0.0f } , m_clustersUbo{ m_device } +#if C3D_DebugUseLightsBVH + , m_lightsAABBBuffer{ makeBufferBase( m_device + , sizeof( AABB ) * MaxLightsCount + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_LightsAABB" ) } + , m_pointMortonCodesBuffers{ { makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_PointLightMortonCodesA" ) + , makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_PointLightMortonCodesB" ) } } + , 
m_spotMortonCodesBuffers{ { makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_SpotLightMortonCodesA" ) + , makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_SpotLightMortonCodesB" ) } } + , m_pointIndicesBuffers{ { makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_PointLightIndicesA" ) + , makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_PointLightIndicesB" ) } } + , m_spotIndicesBuffers{ { makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_SpotLightIndicesA" ) + , makeBufferBase( m_device + , MaxLightsCount * sizeof( u32 ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_SpotLightIndicesB" ) } } + , m_pointBVHBuffer{ makeBufferBase( m_device + , sizeof( AABB ) * getNumNodes( MaxLightsCount ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_PointLightBVH" ) } + , m_spotBVHBuffer{ makeBufferBase( m_device + , sizeof( AABB ) * getNumNodes( MaxLightsCount ) + , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT + , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + , "C3D_SpotLightBVH" ) } +#endif { doUpdate(); } @@ -38,7 +130,9 @@ namespace castor3d m_clustersUbo.cpuUpdate( m_dimensions.value() , m_camera.getNear() , m_clusterSize.value() - , m_nearK.value() ); + 
, m_nearK.value() + , lightCache.getLightsCount( LightType::ePoint ) + , lightCache.getLightsCount( LightType::eSpot ) ); m_lightsDirty = lightCache.hasClusteredLights() && ( m_clustersDirty || lightCache.isDirty() ); m_first = m_camera.getEngine()->areUpdateOptimisationsEnabled() @@ -46,19 +140,90 @@ namespace castor3d : true; } + crg::FramePass const & FrustumClusters::createFramePasses( crg::FramePassGroup & parentGraph + , crg::FramePass const * previousPass + , CameraUbo const & cameraUbo ) + { + auto & graph = parentGraph.createPassGroup( "Clusters" ); + auto lastPass = &createComputeClustersAABBPass( graph, previousPass + , m_device, cameraUbo, *this ); +#if C3D_DebugUseLightsBVH + lastPass = &createReduceLightsAABBPass( graph, lastPass + , m_device, cameraUbo, *this ); + lastPass = &createComputeLightsMortonCodePass( graph, lastPass + , m_device, cameraUbo, *this ); + lastPass = &createBuildLightsBVHPass( graph, lastPass + , m_device, cameraUbo, *this ); +#endif + return createAssignLightsToClustersPass( graph, lastPass + , m_device, cameraUbo, *this ); + } + + uint32_t FrustumClusters::getNumLevels( uint32_t numLeaves ) + { + static const float log32f = std::log( 32.0f ); + uint32_t numLevels = 0; + + if ( numLeaves > 0 ) + { + numLevels = uint32_t( std::ceil( std::log( numLeaves ) / log32f ) ); + } + + return numLevels; + } + + uint32_t FrustumClusters::getNumLevelNodes( uint32_t level ) + { + static const uint32_t numLevelNodes[] = + { + 1, // 1st level (32^0) + 32, // 2nd level (32^1) + 1024, // 3rd level (32^2) + 32768, // 4th level (32^3) + 1048576, // 5th level (32^4) + 33554432, // 6th level (32^5) + }; + return numLevelNodes[level]; + } + + uint32_t FrustumClusters::getNumNodes( uint32_t numLeaves ) + { + static const uint32_t numBVHNodes[] = + { + 1, // 1 level =32^0 + 33, // 2 levels +32^1 + 1057, // 3 levels +32^2 + 33825, // 4 levels +32^3 + 1082401, // 5 levels +32^4 + 34636833, // 6 levels +32^5 + }; + + uint32_t numLevels = getNumLevels( 
numLeaves ); + uint32_t numNodes = 0; + + if ( numLevels > 0 && numLevels < castor::getCountOf( numBVHNodes ) ) + { + numNodes = numBVHNodes[numLevels - 1]; + } + + return numNodes; + } + void FrustumClusters::doUpdate() { + m_toDelete.clear(); + // The half-angle of the field of view in the Y-direction. auto fieldOfViewY = m_camera.getFovY() * 0.5f; f32 zNear = m_camera.getNear(); f32 zFar = m_camera.getFar(); u32 clusterSize = m_clusterSize.value(); - auto size = getSafeBandedSize( m_camera.getSize() ); + auto renderSize = getSafeBandedSize( m_camera.getSize() ); // Number of clusters in the screen X direction. - u32 clusterDimX = u32( std::ceil( f32( size->x ) / f32( clusterSize ) ) ); + u32 clusterDimX = u32( std::ceil( f32( renderSize->x ) / f32( clusterSize ) ) ); // Number of clusters in the screen Y direction. - u32 clusterDimY = u32( std::ceil( f32( size->y ) / f32( clusterSize ) ) ); + u32 clusterDimY = u32( std::ceil( f32( renderSize->y ) / f32( clusterSize ) ) ); // The depth of the cluster grid during clustered rendering is dependent on the // number of clusters subdivisions in the screen Y direction. 
@@ -73,36 +238,20 @@ namespace castor3d m_dimensions = { clusterDimX, clusterDimY, clusterDimZ }; m_cameraProjection = m_camera.getProjection( true ); m_cameraView = m_camera.getView(); + auto cellCount = clusterDimX * clusterDimY * clusterDimZ; - if ( !m_aabbBuffer ) + if ( !m_aabbBuffer + || m_aabbBuffer->getSize() < cellCount * sizeof( AABB ) ) { - constexpr usize maxCellCount = usize( 32u ) * 32u * 128u; - constexpr usize maxIndices = maxCellCount * MaxLightsPerCluster; - m_aabbBuffer = makeBuffer< AABB >( m_device - , maxCellCount - , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT - , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT - , "C3D_ClustersAABB" ); - m_pointIndexBuffer = makeBuffer< u32 >( m_device - , maxIndices - , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT - , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT - , "C3D_PointClustersIndex" ); - m_pointClusterBuffer = makeBuffer< castor::Point2ui >( m_device - , maxCellCount - , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT - , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT - , "C3D_PointClustersGrid" ); - m_spotIndexBuffer = makeBuffer< u32 >( m_device - , maxIndices - , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT - , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT - , "C3D_SpotClustersIndex" ); - m_spotClusterBuffer = makeBuffer< castor::Point2ui >( m_device - , maxCellCount - , VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT - , VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT - , "C3D_SpotClustersGrid" ); + auto indexCount = cellCount * MaxLightsPerCluster; + frscls::updateBuffer< AABB >( m_device, cellCount, "ClustersAABB", m_aabbBuffer, m_toDelete ); + frscls::updateBuffer< castor::Point2ui >( m_device, cellCount, "PointLightClusterGrid", m_pointLightClusterGridBuffer, m_toDelete ); + frscls::updateBuffer< castor::Point2ui >( m_device, cellCount, "SpotLightClusterGrid", m_spotLightClusterGridBuffer, m_toDelete ); + frscls::updateBuffer< 
u32 >( m_device, indexCount, "PointLightClusterIndex", m_pointLightClusterIndexBuffer, m_toDelete ); + frscls::updateBuffer< u32 >( m_device, indexCount, "SpottLightClusterIndex", m_spotLightClusterIndexBuffer, m_toDelete ); + onClusterBuffersChanged( *this ); } } + + //********************************************************************************************* } diff --git a/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp b/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp new file mode 100644 index 0000000000..53c7b70987 --- /dev/null +++ b/source/Core/Castor3D/Render/Clustered/ReduceLightsAABB.cpp @@ -0,0 +1,408 @@ +#include "Castor3D/Render/Clustered/ReduceLightsAABB.hpp" + +#include "Castor3D/Engine.hpp" +#include "Castor3D/Cache/LightCache.hpp" +#include "Castor3D/Render/RenderDevice.hpp" +#include "Castor3D/Render/RenderSystem.hpp" +#include "Castor3D/Render/Clustered/FrustumClusters.hpp" +#include "Castor3D/Scene/Camera.hpp" +#include "Castor3D/Scene/Scene.hpp" +#include "Castor3D/Scene/Light/PointLight.hpp" +#include "Castor3D/Scene/Light/SpotLight.hpp" +#include "Castor3D/Shader/Program.hpp" +#include "Castor3D/Shader/Shaders/GlslAABB.hpp" +#include "Castor3D/Shader/Shaders/GlslAppendBuffer.hpp" +#include "Castor3D/Shader/Shaders/GlslClusteredLights.hpp" +#include "Castor3D/Shader/Shaders/GlslLight.hpp" +#include "Castor3D/Shader/Shaders/GlslUtils.hpp" +#include "Castor3D/Shader/Ubos/CameraUbo.hpp" +#include "Castor3D/Shader/Ubos/ClustersUbo.hpp" + +#include + +#include + +#include +#include + +#define C3D_DebugEnableWarpOptimisation 0 + +namespace castor3d +{ + //********************************************************************************************* + + namespace rdclgb + { + enum BindingPoints + { + eCamera, + eLights, + eClusters, + eLightsAABB, + }; + + static uint32_t constexpr NumThreads = 512u; + + static ShaderPtr createShader( bool first ) + { + sdw::ComputeWriter writer; + + // Inputs + C3D_CameraEx( writer + , eCamera 
+ , 0u + , first ); + shader::LightsBuffer lights{ writer + , eLights + , 0u + , first }; + C3D_ClustersEx( writer + , eClusters + , 0u + , first ); + C3D_LightsAABB( writer + , eLightsAABB + , 0u ); + + sdw::PushConstantBuffer pcb{ writer, "C3D_DispatchData", "c3d_dispatchData" }; + auto c3d_numThreadGroups = pcb.declMember< sdw::UInt >( "numThreadGroups" ); + auto c3d_reduceNumElements = pcb.declMember< sdw::UInt >( "reduceNumElements" ); + pcb.end(); + + auto gsAABBMin = writer.declSharedVariable< sdw::Vec4 >( "gsAABBMin", NumThreads ); + auto gsAABBMax = writer.declSharedVariable< sdw::Vec4 >( "gsAABBMax", NumThreads ); + + // Perform log-step reduction in group shared memory. + auto logStepReduction = [&]( sdw::UInt const & groupIndex + , sdw::UInt const & groupID ) + { + // If we can assume that NUM_THREADS is a power of 2, we can compute + // the reduction index by performing a bit shift. This is equivalent to + // halving the number of values (and threads) that must perform the reduction + // operation. + auto reduceIndex = NumThreads >> 1u; + +#if C3D_DebugEnableWarpOptimisation + while ( reduceIndex > 32u ) +#else + while ( reduceIndex > 1u ) +#endif + { + IF( writer, groupIndex < reduceIndex ) + { + gsAABBMin[groupIndex] = min( gsAABBMin[groupIndex], gsAABBMin[groupIndex + reduceIndex] ); + gsAABBMax[groupIndex] = max( gsAABBMax[groupIndex], gsAABBMax[groupIndex + reduceIndex] ); + } + FI; + + // Sync group shared memory writes. + shader::groupMemoryBarrierWithGroupSync( writer ); + + // Halve the number of threads that participate in the reduction. + reduceIndex >>= 1u; + } + +#if C3D_DebugEnableWarpOptimisation + // Within a warp (of 32 threads), instructions are warp-synchronous + // and the GroupMemoryBarrierWithGroupSync() is no longer needed to ensure + // the previous writes to groups shared memory have completed. + // Source: DirectCompute Optimizations and Best Practices (2010), Eric Young. 
+ // Source: The CUDA Handbook (2013), Nicholas Wilt + IF( writer, groupIndex < 32_u ) + { + while ( writer, reduceIndex > 0u ) + { + // To avoid out-of-bounds memory access, the number of threads in the + // group must be at least 2x the reduce index. For example, the + // thread at index 31 will access elements 31 and 63 so the size of the thread group + // must be at least 64. + if ( NumThreads >= ( reduceIndex << 1u ) ) + { + gsAABBMin[groupIndex] = min( gsAABBMin[groupIndex], gsAABBMin[groupIndex + reduceIndex] ); + gsAABBMax[groupIndex] = max( gsAABBMax[groupIndex], gsAABBMax[groupIndex + reduceIndex] ); + } + + reduceIndex >>= 1u; + } + + IF( writer, groupIndex == 0_u ) + { + c3d_lightsAABB[groupID].min() = gsAABBMin[groupIndex]; + c3d_lightsAABB[groupID].max() = gsAABBMax[groupIndex]; + } + FI; + } + FI; +#else + IF( writer, groupIndex == 0_u ) + { + c3d_lightsAABB[groupID].min() = min( gsAABBMin[groupIndex], gsAABBMin[groupIndex + reduceIndex] ); + c3d_lightsAABB[groupID].max() = max( gsAABBMax[groupIndex], gsAABBMax[groupIndex + reduceIndex] ); + } + FI; +#endif + }; + + writer.implementMainT< sdw::VoidT >( NumThreads + , [&]( sdw::ComputeIn in ) + { + auto groupIndex = in.localInvocationIndex; + auto groupID = in.workGroupID.x(); + auto threadIndex = in.globalInvocationID.x(); + + auto aabbMin = writer.declLocale( "aabbMin" + , vec4( sdw::Float{ FLT_MAX }, FLT_MAX, FLT_MAX, 1.0f ) ); + auto aabbMax = writer.declLocale( "aabbMax" + , vec4( sdw::Float{ -FLT_MAX }, -FLT_MAX, -FLT_MAX, 1.0f ) ); + + if ( first ) + { + // The 1st pass of the reduction operates on the light buffers. + // First compute point lights AABB. 
+ FOR( writer, sdw::UInt, i, threadIndex, i < c3d_clustersData.pointLightCount(), i += c3d_numThreadGroups * NumThreads ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , lights.getDirectionalsEnd() + i * PointLight::LightDataComponents ); + auto point = writer.declLocale( "point" + , lights.getPointLight( lightOffset ) ); + auto vsPosition = writer.declLocale( "vsPosition" + , c3d_cameraData.worldToCurView( vec4( point.position(), 1.0_f ) ) ); + + aabbMin = min( aabbMin, vsPosition - point.base().farPlane() ); + aabbMax = max( aabbMax, vsPosition + point.base().farPlane() ); + } + ROF; + + // Next, expand AABB for spot lights. + FOR( writer, sdw::UInt, i, threadIndex, i < c3d_clustersData.spotLightCount(), i += c3d_numThreadGroups * NumThreads ) + { + auto lightOffset = writer.declLocale( "lightOffset" + , lights.getPointsEnd() + i * SpotLight::LightDataComponents ); + auto spot = writer.declLocale( "spot" + , lights.getSpotLight( lightOffset ) ); + auto vsPosition = writer.declLocale( "vsPosition" + , c3d_cameraData.worldToCurView( vec4( spot.position(), 1.0_f ) ) ); + + aabbMin = min( aabbMin, vsPosition - spot.base().farPlane() ); + aabbMax = max( aabbMax, vsPosition + spot.base().farPlane() ); + } + ROF; + + gsAABBMin[groupIndex] = aabbMin; + gsAABBMax[groupIndex] = aabbMax; + + // Sync group shared memory writes. + shader::groupMemoryBarrierWithGroupSync( writer ); + + // Perform log-step reduction to allow each thread group in the dispatch + // to reduce to a single element. + logStepReduction( groupIndex, groupID ); + } + else + { + // The subsequent passes of the reduction operate on the global AABB computed + // in previous pass. + // This step is repeated until we are reduced to a single thread group. 
+ FOR( writer, sdw::UInt, i, groupIndex, i < c3d_reduceNumElements, i += NumThreads * c3d_numThreadGroups ) + { + aabbMin = min( aabbMin, c3d_lightsAABB[i].min() ); + aabbMax = max( aabbMax, c3d_lightsAABB[i].max() ); + } + ROF; + + gsAABBMin[groupIndex] = aabbMin; + gsAABBMax[groupIndex] = aabbMax; + + // Sync group shared memory writes. + shader::groupMemoryBarrierWithGroupSync( writer ); + + // Perform log-step reduction to allow each thread group in the dispatch + // to reduce to a single element. If there was only a single thread group + // in this dispatch, then this will reduce to a single element. + logStepReduction( groupIndex, groupID ); + } + } ); + return std::make_unique< ast::Shader >( std::move( writer.getShader() ) ); + } + + class FramePass + : public crg::RunnablePass + { + public: + FramePass( crg::FramePass const & framePass + , crg::GraphContext & context + , crg::RunnableGraph & graph + , RenderDevice const & device + , FrustumClusters const & clusters ) + : crg::RunnablePass{ framePass + , context + , graph + , { [this]( uint32_t index ){ doInitialise( index ); } + , GetPipelineStateCallback( [](){ return crg::getPipelineState( VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); } ) + , [this]( crg::RecordContext & recContext, VkCommandBuffer cb, uint32_t i ){ doRecordInto( recContext, cb, i ); } + , GetPassIndexCallback( [](){ return 0u; } ) + , IsEnabledCallback( [this](){ return doIsEnabled(); } ) + , IsComputePassCallback( [](){ return true; } ) } + , crg::ru::Config{ 1u, true /* resettable */ } } + , m_lightCache{ clusters.getCamera().getScene()->getLightCache() } + , m_first{ framePass, context, graph, device, true, clusters } + , m_second{ framePass, context, graph, device, false, clusters } + { + } + + CRG_API void resetPipeline( crg::VkPipelineShaderStageCreateInfoArray config + , uint32_t index ) + { + resetCommandBuffer( index ); + m_first.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_first.createInfo ), 
index ); + m_second.pipeline.resetPipeline( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( m_second.createInfo ), index ); + doCreatePipeline( index, m_first ); + doCreatePipeline( index, m_second ); + reRecordCurrent(); + } + + private: + struct Pipeline + { + ShaderModule shader; + ashes::PipelineShaderStageCreateInfoArray createInfo; + crg::cp::ConfigData cpConfig; + crg::PipelineHolder pipeline; + + Pipeline( crg::FramePass const & framePass + , crg::GraphContext & context + , crg::RunnableGraph & graph + , RenderDevice const & device + , bool first + , FrustumClusters const & clusters ) + : shader{ VK_SHADER_STAGE_COMPUTE_BIT, "ReduceLightsAABB" + ( first ? std::string{ "/First" } : std::string{ "/Second" } ), createShader( first ) } + , createInfo{ ashes::PipelineShaderStageCreateInfoArray{ makeShaderState( device, shader ) } } + , cpConfig{ crg::defaultV< uint32_t const * > + , &clusters.needsLightsUpdate() + , crg::getDefaultV< IsEnabledCallback >() + , crg::getDefaultV< RecordCallback >() + , crg::getDefaultV< RecordCallback >() + , 1u + , 1u + , 1u } + , pipeline{ framePass + , context + , graph + , crg::pp::Config{} + .program( ashes::makeVkArray< VkPipelineShaderStageCreateInfo >( createInfo ) ) + .pushConstants( VkPushConstantRange{ VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u } ) + , VK_PIPELINE_BIND_POINT_COMPUTE + , 1u } + { + } + }; + + private: + LightCache const & m_lightCache; + Pipeline m_first; + Pipeline m_second; + + private: + void doInitialise( uint32_t index ) + { + m_first.pipeline.initialise(); + m_second.pipeline.initialise(); + doCreatePipeline( index, m_first ); + doCreatePipeline( index, m_second ); + } + + bool doIsEnabled()const + { + return ( m_first.cpConfig.isEnabled ? ( *m_first.cpConfig.isEnabled )() : false ) + || ( m_second.cpConfig.isEnabled ? 
( *m_second.cpConfig.isEnabled )() : false ); + } + + void doRecordInto( crg::RecordContext & context + , VkCommandBuffer commandBuffer + , uint32_t index ) + { + struct + { + uint32_t numThreadGroups{}; + uint32_t reduceNumElements{}; + } dispatchData; + + auto pointLightsCount = m_lightCache.getLightsCount( LightType::ePoint ); + auto spoLightsCount = m_lightCache.getLightsCount( LightType::eSpot ); + auto maxLightsCount = std::max( pointLightsCount, spoLightsCount ); + + // Don't dispatch more than 512 thread groups. The reduction algorithm depends on the + // number of thread groups to be no more than 512. The buffer which stores the reduced AABB is sized + // for a maximum of 512 thread groups. + dispatchData.numThreadGroups = std::min( 512u + , uint32_t( std::ceil( float( maxLightsCount ) / 512.0f ) ) ); + + // First pass + m_first.pipeline.recordInto( context, commandBuffer, index ); + m_context.vkCmdPushConstants( commandBuffer, m_first.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u, &dispatchData ); + m_context.vkCmdDispatch( commandBuffer, dispatchData.numThreadGroups, 1u, 1u ); + + if ( dispatchData.numThreadGroups > 1u ) + { + // In the first pass, the number of lights determines the number of + // elements to be reduced. + // In the second pass, the number of elements to be reduced is the + // number of thread groups from the first pass. 
+ dispatchData.reduceNumElements = dispatchData.numThreadGroups; + dispatchData.numThreadGroups = 1u; + // Second pass + m_second.pipeline.recordInto( context, commandBuffer, index ); + m_context.vkCmdPushConstants( commandBuffer, m_second.pipeline.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0u, 8u, &dispatchData ); + m_context.vkCmdDispatch( commandBuffer, dispatchData.numThreadGroups, 1u, 1u ); + } + } + + void doCreatePipeline( uint32_t index + , Pipeline & pipeline ) + { + auto & program = pipeline.pipeline.getProgram( index ); + VkComputePipelineCreateInfo createInfo{ VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO + , nullptr + , 0u + , program.front() + , pipeline.pipeline.getPipelineLayout() + , VkPipeline{} + , 0u }; + pipeline.pipeline.createPipeline( index, createInfo ); + } + }; + } + + //********************************************************************************************* + + crg::FramePass const & createReduceLightsAABBPass( crg::FramePassGroup & graph + , crg::FramePass const * previousPass + , RenderDevice const & device + , CameraUbo const & cameraUbo + , FrustumClusters & clusters ) + { + auto & pass = graph.createPass( "ReduceLightsAABB" + , [&clusters, &device]( crg::FramePass const & framePass + , crg::GraphContext & context + , crg::RunnableGraph & graph ) + { + auto result = std::make_unique< rdclgb::FramePass >( framePass + , context + , graph + , device + , clusters ); + device.renderSystem.getEngine()->registerTimer( framePass.getFullName() + , result->getTimer() ); + return result; + } ); + pass.addDependency( *previousPass ); + cameraUbo.createPassBinding( pass, rdclgb::eCamera ); + auto & lights = clusters.getCamera().getScene()->getLightCache(); + lights.createPassBinding( pass, rdclgb::eLights ); + clusters.getClustersUbo().createPassBinding( pass, rdclgb::eClusters ); + createClearableOutputStorageBinding( pass, uint32_t( rdclgb::eLightsAABB ), "C3D_LightsAABB", clusters.getLightsAABBBuffer(), 0u, ashes::WholeSize ); + 
return pass; + } + + //********************************************************************************************* +} diff --git a/source/Core/Castor3D/Render/Opaque/DeferredRendering.cpp b/source/Core/Castor3D/Render/Opaque/DeferredRendering.cpp index 6a537fa7ca..e3fb862923 100644 --- a/source/Core/Castor3D/Render/Opaque/DeferredRendering.cpp +++ b/source/Core/Castor3D/Render/Opaque/DeferredRendering.cpp @@ -261,6 +261,8 @@ namespace castor3d } ); result.addDependency( lastPass ); result.addInOutDepthStencilView( targetDepth ); + result.addImplicitColourView( m_technique.getSsaoResult().wholeViewId + , VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ); result.addInOutColourView( targetResult ); result.addInOutColourView( m_technique.getRenderTarget().getVelocity()->targetViewId ); return result; diff --git a/source/Core/Castor3D/Render/RenderNodesPass.cpp b/source/Core/Castor3D/Render/RenderNodesPass.cpp index 1ae58684f7..da56a4ba9f 100644 --- a/source/Core/Castor3D/Render/RenderNodesPass.cpp +++ b/source/Core/Castor3D/Render/RenderNodesPass.cpp @@ -605,10 +605,10 @@ namespace castor3d , uint32_t & index ) { descriptorWrites.push_back( frustumClusters.getClustersUbo().getDescriptorWrite( index++ ) ); - bindBuffer( frustumClusters.getPointLightIndexBuffer().getBuffer(), descriptorWrites, index ); - bindBuffer( frustumClusters.getPointLightClusterBuffer().getBuffer(), descriptorWrites, index ); - bindBuffer( frustumClusters.getSpotLightIndexBuffer().getBuffer(), descriptorWrites, index ); - bindBuffer( frustumClusters.getSpotLightClusterBuffer().getBuffer(), descriptorWrites, index ); + bindBuffer( frustumClusters.getPointLightClusterIndexBuffer(), descriptorWrites, index ); + bindBuffer( frustumClusters.getPointLightClusterGridBuffer(), descriptorWrites, index ); + bindBuffer( frustumClusters.getSpotLightClusterIndexBuffer(), descriptorWrites, index ); + bindBuffer( frustumClusters.getSpotLightClusterGridBuffer(), descriptorWrites, index ); } bool 
RenderNodesPass::areValidPassFlags( PassComponentCombine const & passFlags )const diff --git a/source/Core/Castor3D/Render/RenderTechnique.cpp b/source/Core/Castor3D/Render/RenderTechnique.cpp index 6d53214a6c..2d52bbd66b 100644 --- a/source/Core/Castor3D/Render/RenderTechnique.cpp +++ b/source/Core/Castor3D/Render/RenderTechnique.cpp @@ -17,8 +17,7 @@ #include "Castor3D/Model/Mesh/Submesh/Submesh.hpp" #include "Castor3D/Render/RenderSystem.hpp" #include "Castor3D/Render/RenderTarget.hpp" -#include "Castor3D/Render/Clustered/AssignLightsToClusters.hpp" -#include "Castor3D/Render/Clustered/ComputeClustersAABB.hpp" +#include "Castor3D/Render/Clustered/FrustumClusters.hpp" #include "Castor3D/Render/EnvironmentMap/EnvironmentMap.hpp" #include "Castor3D/Render/GlobalIllumination/LightPropagationVolumes/LayeredLightPropagationVolumes.hpp" #include "Castor3D/Render/GlobalIllumination/LightPropagationVolumes/LightPropagationVolumes.hpp" @@ -387,20 +386,10 @@ namespace castor3d , visbuffer } , m_lastDepthPass{ &m_prepass.getLastPass() } , m_depthRangePass{ &m_prepass.getDepthRangePass() } - , m_computeClustersAABB{ ( C3D_UseClusteredRendering - ? &createComputeClustersAABBPass( m_graph + , m_clustersLastPass{ ( C3D_UseClusteredRendering + ? &m_renderTarget.getFrustumClusters().createFramePasses( m_graph , m_depthRangePass - , m_device - , m_cameraUbo - , m_renderTarget.getFrustumClusters() ) - : nullptr ) } - , m_dispatchLightInClusters{ ( C3D_UseClusteredRendering - ? 
&createAssignLightsToClustersPass( m_graph - , m_computeClustersAABB - , m_device - , m_renderTarget.getScene()->getLightCache() - , m_cameraUbo - , m_renderTarget.getFrustumClusters() ) + , m_cameraUbo ) : nullptr ) } , m_background{ doCreateBackgroundPass( progress ) } , m_opaque{ *this @@ -782,9 +771,9 @@ namespace castor3d , TechniquePassEvent::eBeforeBackground , &m_prepass.getLastPass() ); - if ( m_dispatchLightInClusters ) + if ( m_clustersLastPass ) { - previousPasses.push_back( m_dispatchLightInClusters ); + previousPasses.push_back( m_clustersLastPass ); } auto & graph = m_graph.createPassGroup( "Background" ); diff --git a/source/Core/Castor3D/Render/Transparent/TransparentRendering.cpp b/source/Core/Castor3D/Render/Transparent/TransparentRendering.cpp index 87ace2a256..a064615241 100644 --- a/source/Core/Castor3D/Render/Transparent/TransparentRendering.cpp +++ b/source/Core/Castor3D/Render/Transparent/TransparentRendering.cpp @@ -307,6 +307,8 @@ namespace castor3d result.addInOutDepthStencilView( targetDepth ); result.addImplicitColourView( m_mippedColour.targetViewId , VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ); + result.addImplicitColourView( getOwner()->getSsaoResult().wholeViewId + , VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ); auto & transparentPassResult = *m_transparentPassResult; result.addOutputColourView( transparentPassResult[WbTexture::eAccumulation].targetViewId , getClearValue( WbTexture::eAccumulation ) ); diff --git a/source/Core/Castor3D/Shader/ShaderModule.cpp b/source/Core/Castor3D/Shader/ShaderModule.cpp index 6ebb604dec..5fc389f12d 100644 --- a/source/Core/Castor3D/Shader/ShaderModule.cpp +++ b/source/Core/Castor3D/Shader/ShaderModule.cpp @@ -71,6 +71,14 @@ namespace castor3d return MaxPointShadowMapCount; } + void groupMemoryBarrierWithGroupSync( sdw::ShaderWriter & writer ) + { + writer.controlBarrier( sdw::type::Scope::eWorkgroup + , sdw::type::Scope::eWorkgroup + , ( sdw::type::MemorySemanticsMask::eAcquireRelease + | 
sdw::type::MemorySemanticsMask::eWorkgroupMemory ) ); + } + castor::String concatModelNames( castor::String lhs , castor::String rhs ) { diff --git a/source/Core/Castor3D/Shader/Shaders/GlslClusteredLights.cpp b/source/Core/Castor3D/Shader/Shaders/GlslClusteredLights.cpp index 35f4293032..5ec5a84b87 100644 --- a/source/Core/Castor3D/Shader/Shaders/GlslClusteredLights.cpp +++ b/source/Core/Castor3D/Shader/Shaders/GlslClusteredLights.cpp @@ -23,28 +23,28 @@ namespace castor3d::shader , binding++ , set , m_enabled ); - C3D_PointLightGridIndicesEx( writer + C3D_PointLightClusterIndexEx( writer , binding++ , set , m_enabled ); - C3D_PointLightGridClustersEx( writer + C3D_PointLightClusterGridEx( writer , binding++ , set , m_enabled ); - C3D_SpotLightGridIndicesEx( writer + C3D_SpotLightClusterIndexEx( writer , binding++ , set , m_enabled ); - C3D_SpotLightGridClustersEx( writer + C3D_SpotLightClusterGridEx( writer , binding++ , set , m_enabled ); m_clusterData = castor::makeUnique< shader::ClustersData >( c3d_clustersData ); - m_pointLightGridIndex = std::make_unique< sdw::UInt32Array >( c3d_pointLightGridIndices ); - m_pointLightGridCluster = std::make_unique< sdw::U32Vec2Array >( c3d_pointLightGridClusters ); - m_spotLightGridIndex = std::make_unique< sdw::UInt32Array >( c3d_spotLightGridIndices ); - m_spotLightGridCluster = std::make_unique< sdw::U32Vec2Array >( c3d_spotLightGridClusters ); + m_pointLightIndices = std::make_unique< sdw::UInt32Array >( c3d_pointLightClusterIndex ); + m_pointLightClusters = std::make_unique< sdw::U32Vec2Array >( c3d_pointLightClusterGrid ); + m_spotLightIndices = std::make_unique< sdw::UInt32Array >( c3d_spotLightClusterIndex ); + m_spotLightClusters = std::make_unique< sdw::U32Vec2Array >( c3d_spotLightClusterGrid ); } void ClusteredLights::computeCombinedDifSpec( shader::Lights & lights @@ -69,14 +69,14 @@ namespace castor3d::shader , m_clusterData->computeClusterIndex1D( clusterIndex3D ) ); auto pointStartOffset = 
m_writer.declLocale( "pointStartOffset" - , ( *m_pointLightGridCluster )[clusterIndex1D].x() ); + , ( *m_pointLightClusters )[clusterIndex1D].x() ); auto pointLightCount = m_writer.declLocale( "pointLightCount" - , ( *m_pointLightGridCluster )[clusterIndex1D].y() ); + , ( *m_pointLightClusters )[clusterIndex1D].y() ); FOR( m_writer, sdw::UInt, i, 0_u, i < pointLightCount, ++i ) { auto lightOffset = m_writer.declLocale( "lightOffset" - , ( *m_pointLightGridIndex )[pointStartOffset + i] ); + , ( *m_pointLightIndices )[pointStartOffset + i] ); auto light = m_writer.declLocale( "point", lights.getPointLight( lightOffset ) ); lightingModel.compute( light , components @@ -87,14 +87,14 @@ namespace castor3d::shader ROF; auto spotStartOffset = m_writer.declLocale( "spotStartOffset" - , (*m_spotLightGridCluster)[clusterIndex1D].x() ); + , ( *m_spotLightClusters )[clusterIndex1D].x() ); auto spotLightCount = m_writer.declLocale( "spotLightCount" - , ( *m_spotLightGridCluster )[clusterIndex1D].y() ); + , ( *m_spotLightClusters )[clusterIndex1D].y() ); FOR( m_writer, sdw::UInt, i, 0_u, i < spotLightCount, ++i ) { auto lightOffset = m_writer.declLocale( "lightOffset" - , ( *m_spotLightGridIndex )[spotStartOffset + i] ); + , ( *m_spotLightIndices )[spotStartOffset + i] ); auto light = m_writer.declLocale( "spot", lights.getSpotLight( lightOffset ) ); lightingModel.compute( light , components diff --git a/source/Core/Castor3D/Shader/Ubos/ClustersUbo.cpp b/source/Core/Castor3D/Shader/Ubos/ClustersUbo.cpp index a8332e79de..739d71650f 100644 --- a/source/Core/Castor3D/Shader/Ubos/ClustersUbo.cpp +++ b/source/Core/Castor3D/Shader/Ubos/ClustersUbo.cpp @@ -2,6 +2,7 @@ #include "Castor3D/Buffer/UniformBufferPool.hpp" #include "Castor3D/Render/RenderDevice.hpp" +#include "Castor3D/Render/Clustered/FrustumClusters.hpp" #include @@ -99,7 +100,9 @@ namespace castor3d void ClustersUbo::cpuUpdate( castor::Point3ui gridDim , float viewNear , uint32_t clusterSize - , float nearK ) + , float 
nearK + , uint32_t pointLightsCount + , uint32_t spotLightsCount ) { CU_Require( m_ubo ); auto & configuration = m_ubo.getData(); @@ -108,6 +111,10 @@ namespace castor3d configuration.viewNear = viewNear; configuration.nearK = nearK; configuration.logGridDimY = 1.0f / std::log( nearK ); + configuration.pointLightsCount = pointLightsCount; + configuration.spotLightsCount = spotLightsCount; + configuration.pointLightLevelsCount = FrustumClusters::getNumLevels( pointLightsCount ); + configuration.spotLightLevelsCount = FrustumClusters::getNumLevels( spotLightsCount ); } //********************************************************************************************* diff --git a/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp b/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp index af68d3fb8c..a5cd31fffd 100644 --- a/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp +++ b/source/Core/CastorUtils/CastorUtilsPrerequisites.cpp @@ -89,6 +89,6 @@ namespace castor stream << "Assertion failed: " << description << "\n"; stream << Debug::Backtrace{}; cuLogError( stream.str().c_str() ); - std::abort(); + assert( false ); } }