From f6a447e31ae7c4583c765042071f48421269cfce Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Tue, 14 May 2024 14:56:32 +0800 Subject: [PATCH] Update xgl from commit 2a97f7f5 Update Khronos Vulkan Headers to 1.3.280 Release PHOENIX2 Expose the extension VK_EXT_nested_command_buffer Expose the extension VK_KHR_dynamic_rendering_local_read Expose the extension VK_KHR_shader_maximal_reconvergence Enable VK_KHR_shader_maximal_reconvergence for pre-Navi3 Accurately consider extended usage in VkGetPhysicalDeviceImageFormatProperties2 Add EnableFastLBVH setting Add mapping P012: VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16 to P016 format Add support for experiments Add TriangleSplittingBudgetPerTriangle and tsPriority GPURT settings Changes to separate legacy/acq-rel in render pass code Enable 8-bit integer dot product with accumulation properties Enable RT Recursion Implement VK_NV_device_generated_commands MaxWavesPerCu graphics fix follow up OR shaderIdExtraBits into traceRayGpuVa Refine pipeline dump and option waitForDebugger Refine pipeline dumps for graphics pipeline library Remove IsDxgiEnabled check Set EnableTraceRayAccelStructTracking True Set TriangleSplittingFactor to 1.15 Split DevModeMgr into UberTrace and RGP paths Switch app_profile.cpp::GetExecutableName to fetch the executable name using open() instead of popen() Update PAL Version to 867 Bump LLPC version to 71 Fix bind buffers 2 Fix counter overflow with deferred operations Fix crash in DXVK Left 4 Dead 2 Fix crash in GPURT client callback causing CTS failures Fix MaxWavesPerCu for graphics shaders Fix mutable descriptors when no size is specified Fix typos in pipeline dump and shader module id Fix dEQP-GLES31.functional.shaders.* failures in Angle --- cmake/XglCompileDefinitions.cmake | 4 + cmake/XglOptions.cmake | 2 + cmake/XglOverrides.cmake | 4 + cmake/XglVersions.cmake | 4 +- icd/CMakeLists.txt | 34 +- icd/Loader/LunarG/Lnx/amd-icd.json | 4 +- icd/api/app_profile.cpp | 50 +- icd/api/app_shader_optimizer.cpp | 29 +- icd/api/cmd_buffer_ring.cpp | 2 +- icd/api/compiler_solution.cpp | 7 +- icd/api/compiler_solution_llpc.cpp | 55 +- icd/api/debug_printf.cpp | 2 +- icd/api/devmode/devmode_mgr.h | 341 +---------- .../{devmode_mgr.cpp => devmode_rgp.cpp} | 180 ++++-- icd/api/devmode/devmode_rgp.h | 320 ++++++++++ icd/api/devmode/devmode_ubertrace.cpp | 577 ++++++++++++++++++ icd/api/devmode/devmode_ubertrace.h | 148 +++++ icd/api/entry.cpp | 35 ++ icd/api/graphics_pipeline_common.cpp | 22 +- icd/api/include/app_profile.h | 2 - icd/api/include/app_shader_optimizer.h | 2 +- icd/api/include/compiler_solution.h | 11 + icd/api/include/defer_compile_thread.h | 2 +- icd/api/include/graphics_pipeline_common.h | 4 +- .../khronos/sdk-1.3/vulkan/vulkan_core.h | 15 +- icd/api/include/pipeline_binary_cache.h | 6 +- icd/api/include/pipeline_compiler.h | 17 + icd/api/include/vk_cmdbuffer.h | 67 +- icd/api/include/vk_compute_pipeline.h | 2 +- icd/api/include/vk_conv.h | 72 +++ icd/api/include/vk_device.h | 24 +- icd/api/include/vk_extensions.h | 4 + icd/api/include/vk_formats.h | 13 +- icd/api/include/vk_indirect_commands_layout.h | 147 +++++ icd/api/include/vk_instance.h | 8 +- icd/api/include/vk_physical_device_manager.h | 2 + icd/api/include/vk_pipeline.h | 14 +- icd/api/include/vk_queue.h | 37 +- icd/api/include/vk_swapchain.h | 25 +- icd/api/include/vk_utils.h | 4 +- icd/api/pipeline_binary_cache.cpp | 24 +- icd/api/pipeline_compiler.cpp | 136 +++-- icd/api/raytrace/ray_tracing_device.cpp | 23 +-
icd/api/raytrace/vk_ray_tracing_pipeline.cpp | 225 ++++--- icd/api/renderpass/renderpass_builder.cpp | 112 +++- icd/api/renderpass/renderpass_types.h | 6 +- icd/api/sqtt/sqtt_layer.cpp | 64 +- icd/api/sqtt/sqtt_layer.h | 4 +- icd/api/strings/entry_points.txt | 11 + icd/api/strings/extensions.txt | 3 + icd/api/strings/generate_strings.py | 2 +- icd/api/vk_cmdbuffer.cpp | 357 +++++++++-- icd/api/vk_compute_pipeline.cpp | 142 +++-- icd/api/vk_conv.cpp | 56 ++ icd/api/vk_descriptor_pool.cpp | 4 +- icd/api/vk_device.cpp | 62 +- icd/api/vk_dispatch.cpp | 11 + icd/api/vk_formats.cpp | 341 +++++++++++ icd/api/vk_graphics_pipeline.cpp | 229 ++++--- icd/api/vk_graphics_pipeline_library.cpp | 46 +- icd/api/vk_indirect_commands_layout.cpp | 382 ++++++++++++ icd/api/vk_instance.cpp | 42 +- icd/api/vk_physical_device.cpp | 129 +++- icd/api/vk_physical_device_manager.cpp | 35 +- icd/api/vk_pipeline.cpp | 64 +- icd/api/vk_queue.cpp | 273 ++++++--- icd/api/vk_semaphore.cpp | 2 +- icd/api/vk_swapchain.cpp | 65 +- icd/api/vk_utils.cpp | 2 - icd/res/ver.h | 4 +- icd/settings/experimentsLoader.cpp | 71 +++ icd/settings/experimentsLoader.h | 79 +++ icd/settings/experiments_settings_xgl.json | 287 +++++++++ icd/settings/settings.cpp | 318 +++++++++- icd/settings/settings.h | 15 +- icd/settings/settings_xgl.json | 102 ++-- 76 files changed, 4827 insertions(+), 1199 deletions(-) rename icd/api/devmode/{devmode_mgr.cpp => devmode_rgp.cpp} (95%) create mode 100644 icd/api/devmode/devmode_rgp.h create mode 100644 icd/api/devmode/devmode_ubertrace.cpp create mode 100644 icd/api/devmode/devmode_ubertrace.h create mode 100644 icd/api/include/vk_indirect_commands_layout.h create mode 100644 icd/api/vk_indirect_commands_layout.cpp create mode 100644 icd/settings/experimentsLoader.cpp create mode 100644 icd/settings/experimentsLoader.h create mode 100644 icd/settings/experiments_settings_xgl.json diff --git a/cmake/XglCompileDefinitions.cmake b/cmake/XglCompileDefinitions.cmake index f015f753..8ddc2268 100644 --- a/cmake/XglCompileDefinitions.cmake +++ b/cmake/XglCompileDefinitions.cmake @@ -83,6 +83,10 @@ macro(xgl_set_compile_definitions) target_compile_definitions(xgl PRIVATE VKI_BUILD_PHOENIX1=1) endif() + if(XGL_BUILD_PHOENIX2) + target_compile_definitions(xgl PRIVATE VKI_BUILD_PHOENIX2=1) + endif() + if(XGL_BUILD_REMBRANDT) target_compile_definitions(xgl PRIVATE VKI_BUILD_REMBRANDT=1) endif() diff --git a/cmake/XglOptions.cmake b/cmake/XglOptions.cmake index f89534a1..a01a691e 100644 --- a/cmake/XglOptions.cmake +++ b/cmake/XglOptions.cmake @@ -63,6 +63,8 @@ macro(xgl_options) option(XGL_BUILD_PHOENIX1 "Build vulkan for PHOENIX1" ON) + option(XGL_BUILD_PHOENIX2 "Build vulkan for PHOENIX2" ON) + option(XGL_BUILD_TESTS "Build tests?" OFF) option(XGL_BUILD_TOOLS "Build tools?" OFF) diff --git a/cmake/XglOverrides.cmake b/cmake/XglOverrides.cmake index c80e77f7..d3939cd0 100644 --- a/cmake/XglOverrides.cmake +++ b/cmake/XglOverrides.cmake @@ -110,6 +110,8 @@ macro(xgl_overrides_pal) set(PAL_BUILD_GFX11 1 CACHE BOOL "${PROJECT_NAME} override." FORCE) endif() + set(PAL_BUILD_PHOENIX2 ${XGL_BUILD_PHOENIX2} CACHE BOOL "${PROJECT_NAME} override." FORCE) + # Wayland set(PAL_BUILD_WAYLAND ${BUILD_WAYLAND_SUPPORT} CACHE BOOL "Build PAL with Wayland support" FORCE) @@ -160,6 +162,8 @@ macro(xgl_overrides_vkgc) set(LLPC_BUILD_PHOENIX1 ${XGL_BUILD_PHOENIX1} CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_PHOENIX2 ${XGL_BUILD_PHOENIX2} CACHE BOOL "${PROJECT_NAME} override." 
FORCE) + set(LLPC_ENABLE_WERROR ${ICD_ANALYSIS_WARNINGS_AS_ERRORS} CACHE BOOL "${PROJECT_NAME} override." FORCE) endmacro() diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index ef0908bd..02029eeb 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "856") +set(ICD_PAL_CLIENT_MAJOR_VERSION "867") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. @@ -42,4 +42,4 @@ set(ICD_GPURT_CLIENT_MAJOR_VERSION "46") # This will become the value of LLPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_LLPC=1. # It describes the version of the interface version of LLPC that the ICD supports. -set(ICD_LLPC_CLIENT_MAJOR_VERSION "70") +set(ICD_LLPC_CLIENT_MAJOR_VERSION "71") diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index e9d1d557..2da6c892 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -150,6 +150,7 @@ target_sources(xgl PRIVATE api/vk_gpa_session.cpp api/vk_descriptor_update_template.cpp api/vk_utils.cpp + api/vk_indirect_commands_layout.cpp api/appopt/barrier_filter_layer.cpp api/appopt/strange_brigade_layer.cpp api/appopt/g_shader_profile.cpp @@ -302,6 +303,34 @@ target_sources(xgl PRIVATE settings/settings_xgl.json ) +add_custom_command( + OUTPUT ${ICD_SETTINGS_DIR}/g_experiments.cpp ${ICD_SETTINGS_DIR}/g_experiments.h + COMMAND ${PYTHON_CMD} ${ICD_GEN_SETTINGS} + -i ${ICD_SETTINGS_DIR}/experiments_settings_xgl.json + -o ${ICD_SETTINGS_DIR} + -g experiments + -s settings/experimentsLoader.h + --namespaces vk + --settings-struct-name ExpSettings + --classname ExperimentsLoader + DEPENDS ${ICD_GEN_SETTINGS_FILES} ${ICD_SETTINGS_DIR}/experiments_settings_xgl.json + COMMENT "Generating Vulkan settings code from experiments_settings_xgl.json" +) + +add_custom_target( + RunVKExperimentsGenerator + DEPENDS ${ICD_GEN_SETTINGS_FILES} ${ICD_SETTINGS_DIR}/experiments_settings_xgl.json + COMMENT "Checking if re-generation is required for settings" +) + +add_dependencies(xgl RunVKExperimentsGenerator) + +target_sources(xgl PRIVATE + settings/g_experiments.cpp + settings/experimentsLoader.cpp + settings/experiments_settings_xgl.json +) + ### ICD api/sqtt ############################################################## target_sources(xgl PRIVATE api/sqtt/sqtt_layer.cpp @@ -311,7 +340,10 @@ target_sources(xgl PRIVATE ### ICD api/devmode ########################################################### if(ICD_GPUOPEN_DEVMODE_BUILD) - target_sources(xgl PRIVATE api/devmode/devmode_mgr.cpp) + target_sources(xgl PRIVATE + api/devmode/devmode_rgp.cpp + api/devmode/devmode_ubertrace.cpp + ) endif() ### ICD layer ################################################################## diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index 7e32c74a..c6aed269 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.279" + "api_version": 
"1.3.280" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.279", + "api_version": "1.3.280", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp index 17df92c0..cf466325 100644 --- a/icd/api/app_profile.cpp +++ b/icd/api/app_profile.cpp @@ -40,6 +40,7 @@ #if defined(__unix__) #include +#include #include #endif @@ -731,18 +732,6 @@ constexpr AppProfilePatternEntry AppEngineQuanticDream "quantic dream engine" }; -constexpr AppProfilePatternEntry AppNameEnshrouded = -{ - PatternAppNameLower, - "enshrouded" -}; - -constexpr AppProfilePatternEntry AppEngineHolistic = -{ - PatternEngineNameLower, - "holistic" -}; - constexpr AppProfilePatternEntry PatternEnd = {}; // This is a table of patterns. The first matching pattern in this table will be returned. @@ -1477,23 +1466,6 @@ AppProfilePattern AppPatternTable[] = } }, - { - AppProfile::Enshrouded, - { - AppNameEnshrouded, - AppEngineHolistic, - PatternEnd - } - }, - - { - AppProfile::HolisticEngine, - { - AppEngineHolistic, - PatternEnd - } - }, - { AppProfile::Zink, { @@ -1702,24 +1674,14 @@ static char* GetExecutableName( size_t* pLength, bool includeExtension) // true if you want the extension on the file name. { - pid_t pid = getpid(); char* pExecutable = nullptr; - char* pModuleFileName = nullptr; char path[PATH_MAX] = {0}; - char commandStringBuffer[PATH_MAX] = {0}; - sprintf(commandStringBuffer, "cat /proc/%d/cmdline", pid); - FILE* pCommand = popen(commandStringBuffer, "r"); - if (pCommand != nullptr) + pExecutable = static_cast(malloc(PATH_MAX)); + + if (pExecutable != nullptr) { - if (fgets(path, PATH_MAX, pCommand) != nullptr) - { - pExecutable = static_cast(malloc(PATH_MAX)); - pModuleFileName = strrchr(path, '/') ? strrchr(path, '/') + 1 : path; - pModuleFileName = strrchr(pModuleFileName, '\\') ? 
strrchr(pModuleFileName, '\\') + 1 : pModuleFileName; - strcpy(pExecutable, pModuleFileName); - *pLength = strlen(pExecutable); - } - pclose(pCommand); + utils::GetExecutableNameAndPath(pExecutable, &path[0]); + *pLength = strlen(pExecutable); } return pExecutable; } diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp index 6988ec7e..475b230f 100644 --- a/icd/api/app_shader_optimizer.cpp +++ b/icd/api/app_shader_optimizer.cpp @@ -592,14 +592,20 @@ void ShaderOptimizer::ApplyProfileToDynamicComputeShaderInfo( } // ===================================================================================================================== -void ShaderOptimizer::ApplyProfileToDynamicGraphicsShaderInfo( +bool ShaderOptimizer::ApplyProfileToDynamicGraphicsShaderInfo( const ShaderProfileAction& action, Pal::DynamicGraphicsShaderInfo* pGraphicsShaderInfo) const { + bool hasUpdate = false; + if (action.dynamicShaderInfo.apply.maxWavesPerCu) { pGraphicsShaderInfo->maxWavesPerCu = static_cast(action.dynamicShaderInfo.maxWavesPerCu); + + hasUpdate = true; } + + return hasUpdate; } // ===================================================================================================================== @@ -631,25 +637,32 @@ void ShaderOptimizer::ApplyProfileToGraphicsPipelineCreateInfo( switch (vkgcStage) { case ShaderStage::ShaderStageTask: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ts); + pGraphicsShaderInfos->enable.ts |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ts); break; case ShaderStage::ShaderStageVertex: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->vs); + pGraphicsShaderInfos->enable.vs |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->vs); break; case ShaderStage::ShaderStageTessControl: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->hs); + pGraphicsShaderInfos->enable.hs |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->hs); break; case ShaderStage::ShaderStageTessEval: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ds); + pGraphicsShaderInfos->enable.ds |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ds); break; case ShaderStage::ShaderStageGeometry: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->gs); + pGraphicsShaderInfos->enable.gs |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->gs); break; case ShaderStage::ShaderStageMesh: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ms); + pGraphicsShaderInfos->enable.ms |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ms); break; case ShaderStage::ShaderStageFragment: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ps); + pGraphicsShaderInfos->enable.ps |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ps); break; default: PAL_ASSERT_ALWAYS(); diff --git a/icd/api/cmd_buffer_ring.cpp b/icd/api/cmd_buffer_ring.cpp index 63a96f0a..40bf1556 100644 --- a/icd/api/cmd_buffer_ring.cpp +++ b/icd/api/cmd_buffer_ring.cpp @@ -188,7 +188,7 @@ void CmdBufferRing::DestroyCmdBufState( // Wait to finish in case still in flight if (pCmdBufState->pFence->GetStatus() == Pal::Result::NotReady) { - 
pDevice->PalDevice(deviceIdx)->WaitForFences(1, &pCmdBufState->pFence, true, ~0ULL); + pDevice->PalDevice(deviceIdx)->WaitForFences(1, &pCmdBufState->pFence, true, std::chrono::nanoseconds::max()); } // Destroy Fence diff --git a/icd/api/compiler_solution.cpp b/icd/api/compiler_solution.cpp index d20068c3..3942a4a1 100644 --- a/icd/api/compiler_solution.cpp +++ b/icd/api/compiler_solution.cpp @@ -206,7 +206,7 @@ void CompilerSolution::StoreShaderBinaryToCache( if (updateBinaryCache || updateAppCache || (pCacheBinary->pCode == nullptr)) { - if ((pHeader->binaryLength > 0) && (pCacheBinary->codeSize == 0)) + if (((pHeader->binaryLength > 0) || (pHeader->requireFullPipeline)) && (pCacheBinary->codeSize == 0)) { size_t cacheSize = sizeof(ShaderLibraryBlobHeader) + pHeader->binaryLength + pHeader->fragMetaLength; @@ -218,7 +218,10 @@ void CompilerSolution::StoreShaderBinaryToCache( if (pBuffer != nullptr) { memcpy(pBuffer, pHeader, sizeof(ShaderLibraryBlobHeader)); - memcpy(Util::VoidPtrInc(pBuffer, sizeof(ShaderLibraryBlobHeader)), pBlob, pHeader->binaryLength); + if (pBlob != nullptr) + { + memcpy(Util::VoidPtrInc(pBuffer, sizeof(ShaderLibraryBlobHeader)), pBlob, pHeader->binaryLength); + } if (pFragmentMeta != nullptr) { memcpy(Util::VoidPtrInc(pBuffer, sizeof(ShaderLibraryBlobHeader) + pHeader->binaryLength), diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 5cb39cc5..15be0b29 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -507,7 +507,22 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( } } - if (hitCache == false) + bool checkShaderModuleIdUsage = false; + if (hitCache) + { + const auto* pShaderLibraryHeader = + reinterpret_cast<const ShaderLibraryBlobHeader*>(shaderLibraryBinary.pCode); + if (pShaderLibraryHeader->requireFullPipeline) + { + checkShaderModuleIdUsage = true; + } + } + else + { + checkShaderModuleIdUsage = true; + } + + if (checkShaderModuleIdUsage) { for (uint32_t stage = 0; stage < ShaderStage::ShaderStageGfxCount; stage++) { @@ -552,10 +567,14 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( if (llpcResult == Vkgc::Result::Success) { - blobHeader.binaryLength = finalBinary.codeSize; + blobHeader.binaryLength = finalBinary.codeSize; blobHeader.fragMetaLength = pipelineOut.fsOutputMetaDataSize; } - else if (llpcResult != Vkgc::Result::RequireFullPipeline) + else if (llpcResult == Vkgc::Result::RequireFullPipeline) + { + blobHeader.requireFullPipeline = true; + } + else { result = (llpcResult == Vkgc::Result::ErrorOutOfMemory) ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_ERROR_INITIALIZATION_FAILED; @@ -567,18 +586,15 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( // Always call StoreShaderBinaryToCache to sync data between app cache and binary cache except // RequireFullPipeline. When cache is hit, blobHeader is zero, StoreShaderBinaryToCache will ignore // finalBinary, and reuse shaderLibraryBinary.
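// A minimal, self-contained sketch of the cache-entry layout implied by the two
// hunks above: a ShaderLibraryBlobHeader followed by an optional ELF payload and
// optional fragment metadata. Once requireFullPipeline entries exist, a header-only
// record (binaryLength == 0) is legal, so each payload copy must be null-guarded.
// BlobHeaderSketch and PackShaderLibraryBlob are illustrative stand-ins, not the
// driver's actual declarations.

#include <cstddef>
#include <cstring>

struct BlobHeaderSketch
{
    size_t binaryLength;        // size of the PAL ELF payload, 0 if absent
    size_t fragMetaLength;      // size of the fragment-output metadata, 0 if absent
    bool   requireFullPipeline; // recorded when LLPC returns RequireFullPipeline
};

static void PackShaderLibraryBlob(
    void*                   pBuffer,    // sizeof(BlobHeaderSketch) + binaryLength + fragMetaLength bytes
    const BlobHeaderSketch& header,
    const void*             pElf,       // may be nullptr for header-only entries
    const void*             pFragMeta)  // may be nullptr
{
    char* pDst = static_cast<char*>(pBuffer);

    std::memcpy(pDst, &header, sizeof(header)); // the header is always written
    pDst += sizeof(header);

    if (pElf != nullptr) // absent when only requireFullPipeline is being cached
    {
        std::memcpy(pDst, pElf, header.binaryLength);
        pDst += header.binaryLength;
    }

    if (pFragMeta != nullptr)
    {
        std::memcpy(pDst, pFragMeta, header.fragMetaLength);
    }
}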
- if ((finalBinary.pCode != nullptr) || (shaderLibraryBinary.pCode != nullptr)) - { - StoreShaderBinaryToCache( - pPipelineCache, - &cacheId, - &blobHeader, - finalBinary.pCode, - pipelineOut.fsOutputMetaData, - hitCache, - hitAppCache, - &shaderLibraryBinary); - } + StoreShaderBinaryToCache( + pPipelineCache, + &cacheId, + &blobHeader, + finalBinary.pCode, + pipelineOut.fsOutputMetaData, + hitCache, + hitAppCache, + &shaderLibraryBinary); pModuleState->elfPackage = shaderLibraryBinary; pModuleState->pFsOutputMetaData = nullptr; @@ -830,7 +846,7 @@ void LlpcHelperThreadProvider::WaitForTasks() { while (m_pDeferredWorkload->completedInstances < m_pDeferredWorkload->totalInstances) { - m_pDeferredWorkload->event.Wait(1.0f); + m_pDeferredWorkload->event.Wait(Util::fseconds { 1.0f }); } } @@ -1229,8 +1245,11 @@ Vkgc::BinaryData CompilerSolutionLlpc::ExtractPalElfBinary( { Vkgc::BinaryData elfBinary = {}; const ShaderLibraryBlobHeader* pHeader = reinterpret_cast<const ShaderLibraryBlobHeader*>(shaderBinary.pCode); - elfBinary.pCode = pHeader + 1; - elfBinary.codeSize = pHeader->binaryLength; + if (pHeader->binaryLength > 0) + { + elfBinary.pCode = pHeader + 1; + elfBinary.codeSize = pHeader->binaryLength; + } return elfBinary; } diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp index 1517112a..6fd013ac 100644 --- a/icd/api/debug_printf.cpp +++ b/icd/api/debug_printf.cpp @@ -214,7 +214,7 @@ Pal::Result DebugPrintf::PostQueueProcess( while (true) { palResult = pDevice->PalDevice(DefaultDeviceIndex)->WaitForSemaphores( - 1, palSemaphores, waitValues, 0, 1000000llu); + 1, palSemaphores, waitValues, 0, std::chrono::nanoseconds {1000000llu}); decodeOffset = ProcessDebugPrintfBuffer(pDevice, deviceIdx, decodeOffset, &file); if ((PalToVkResult(palResult) <= 0) || (loopIndex++ > 1000)) diff --git a/icd/api/devmode/devmode_mgr.h b/icd/api/devmode/devmode_mgr.h index 0a99d53e..61d7ce80 100644 --- a/icd/api/devmode/devmode_mgr.h +++ b/icd/api/devmode/devmode_mgr.h @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file devmode_mgr.h -* @brief Contains the GPU Open Developer Mode manager (DevModeMgr) +* @brief Contains the GPU Open Developer Mode interface (IDevMode) *********************************************************************************************************************** */ @@ -39,60 +39,12 @@ #include "include/vk_device.h" // PAL headers -#include "palHashMap.h" #include "palQueue.h" -#include "palUtil.h" -#include "palList.h" -#include "palVector.h" - -// gpuutil headers -#include "gpuUtil/palGpaSession.h" #if ICD_GPUOPEN_DEVMODE_BUILD -// gpuopen headers -#include "gpuopen.h" - #endif - -// PAL forward declarations -namespace Pal -{ -class ICmdBuffer; -class IFence; -class IQueueSemaphore; -struct PalPublicSettings; -} - -// GPUOpen forward declarations -namespace DevDriver -{ -class DevDriverServer; -class PipelineUriService; -class IMsgChannel; -struct MessageBuffer; - -namespace DriverControlProtocol -{ -enum struct DeviceClockMode : uint32_t; -class HandlerServer; -} - -namespace SettingsProtocol -{ -class HandlerServer; -struct Setting; -} - -namespace RGPProtocol -{ -class RGPServer; -} -} // Vulkan forward declarations namespace vk { class Instance; -class Queue; class Pipeline; #if VKI_RAY_TRACING class RayTracingPipeline; #endif @@ -107,26 +59,18 @@ namespace vk // ===================================================================================================================== // This class provides
functionality to interact with the GPU Open Developer Mode message passing service and the rest // of the driver. -class DevModeMgr +class IDevMode { #if ICD_GPUOPEN_DEVMODE_BUILD public: - // Number of frames to wait before collecting a hardware trace. - // Note: This will be replaced in the future by a remotely configurable value provided by the RGP server. - static constexpr uint32_t NumTracePreparationFrames = 4; - // Pipeline hash used for instruction tracing whenever no pipeline is being targeted. static constexpr uint64_t InvalidTargetPipelineHash = 0; - ~DevModeMgr(); - - static VkResult Create(Instance* pInstance, DevModeMgr** ppObject); - - void Finalize( + virtual void Finalize( uint32_t deviceCount, - VulkanSettingsLoader* settingsLoaders[]); + VulkanSettingsLoader* settingsLoaders[]) = 0; - void Destroy(); + virtual void Destroy() = 0; enum class FrameDelimiterType : uint32_t { @@ -136,280 +80,61 @@ class DevModeMgr Count }; - void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType); - void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType); - void WaitForDriverResume(); - void PipelineCreated(Device* pDevice, Pipeline* pPipeline); - void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline); + virtual void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType) = 0; + virtual void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType) = 0; + virtual void WaitForDriverResume() = 0; + virtual void PipelineCreated(Device* pDevice, Pipeline* pPipeline) = 0; + virtual void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline) = 0; #if VKI_RAY_TRACING - void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline); - void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline); + virtual void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline) = 0; + virtual void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline) = 0; #endif - void PostDeviceCreate(Device* pDevice); - void PreDeviceDestroy(Device* pDevice); - void NotifyPreSubmit(); + virtual void PostDeviceCreate(Device* pDevice) = 0; + virtual void PreDeviceDestroy(Device* pDevice) = 0; + virtual void NotifyPreSubmit() = 0; - uint64_t GetInstructionTraceTargetHash(); - void StartInstructionTrace(CmdBuffer* pCmdBuffer); - void StopInstructionTrace(CmdBuffer* pCmdBuffer); + virtual uint64_t GetInstructionTraceTargetHash() = 0; + virtual void StartInstructionTrace(CmdBuffer* pCmdBuffer) = 0; + virtual void StopInstructionTrace(CmdBuffer* pCmdBuffer) = 0; - bool IsTracingEnabled() const; - bool IsCrashAnalysisEnabled() const { return m_crashAnalysisEnabled; } + virtual bool IsTracingEnabled() const = 0; + virtual bool IsCrashAnalysisEnabled() const = 0; - Pal::Result TimedQueueSubmit( + virtual Pal::Result TimedQueueSubmit( uint32_t deviceIdx, Queue* pQueue, uint32_t cmdBufferCount, const VkCommandBuffer* pCommandBuffers, const Pal::SubmitInfo& submitInfo, - VirtualStackFrame* pVirtStackFrame); + VirtualStackFrame* pVirtStackFrame) = 0; - Pal::Result TimedSignalQueueSemaphore( + virtual Pal::Result TimedSignalQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, uint64_t value, - Pal::IQueueSemaphore* pQueueSemaphore); + Pal::IQueueSemaphore* pQueueSemaphore) = 0; - Pal::Result TimedWaitQueueSemaphore( + virtual Pal::Result TimedWaitQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, uint64_t value, -
Pal::IQueueSemaphore* pQueueSemaphore) = 0; - inline bool IsQueueTimingActive(const Device* pDevice) const; - inline bool GetTraceFrameBeginTag(uint64_t* pTag) const; - inline bool GetTraceFrameEndTag(uint64_t* pTag) const; + virtual bool IsQueueTimingActive(const Device* pDevice) const = 0; + virtual bool GetTraceFrameBeginTag(uint64_t* pTag) const = 0; + virtual bool GetTraceFrameEndTag(uint64_t* pTag) const = 0; - Util::Result RegisterPipelineCache( + virtual Util::Result RegisterPipelineCache( PipelineBinaryCache* pPipelineCache, - uint32_t postSizeLimit); - - void DeregisterPipelineCache( - PipelineBinaryCache* pPipelineCache); - Util::ListIterator GetPipelineCacheListIterator() - { return m_pipelineCaches.Begin(); } - - Util::RWLock* GetPipelineReinjectionLock() - { return &m_pipelineReinjectionLock; } - -private: - // Steps that an RGP trace goes through - enum class TraceStatus : uint32_t - { - // "Pre-trace" stages: - Idle = 0, // No active trace and none requested - Pending, // We've identified that a trace has been requested and we've received its parameters, - // but we have not yet seen the first frame. - Preparing, // A trace has been requested but is not active yet because we are - // currently sampling timing information over some number of lead frames. - Running, // SQTT and queue timing is currently active for all command buffer submits. - - // "Post-trace" stages: - WaitingForSqtt, // Command to turn off SQTT has been submitted and we're waiting for fence confirmation. - Ending // Tracing is no longer active, but all results are not yet ready. - }; - - // Various trigger modes supported for RGP traces - enum class TriggerMode : uint32_t - { - Present = 0, // Traces triggered by presents - Index, // Traces triggered by frame indices - Tag // Traces triggered by command buffer tags - }; - - // Queue family (type)-specific state to support RGP tracing (part of device state) - struct TraceQueueFamilyState - { - uint32_t queueFamilyIndex; - Pal::QueueType queueType; - Pal::EngineType engineType; - Pal::ICmdBuffer* pTraceBeginCmdBuf; - Pal::ICmdBuffer* pTraceBeginSqttCmdBuf; - Pal::ICmdBuffer* pTraceEndSqttCmdBuf; - Pal::ICmdBuffer* pTraceEndCmdBuf; - Pal::ICmdBuffer* pTraceFlushCmdBuf; - bool supportsTracing; - bool usedForBegin; - bool usedForEndSqtt; - bool usedForEnd; - }; - - // Queue-specific resources to support RGP tracing (part of device state) - struct TraceQueueState - { - const Queue* pQueue; - TraceQueueFamilyState* pFamily; - Pal::uint64 queueId; - Pal::uint64 queueContext; - bool timingSupported; - }; - - static constexpr uint32_t MaxTraceQueueFamilies = Queue::MaxQueueFamilies; - static constexpr uint32_t MaxTraceQueues = MaxTraceQueueFamilies * Queue::MaxQueuesPerFamily; - - // All per-device state to support RGP tracing - struct TraceState - { - TraceStatus status; // Current trace status (idle, running, etc.) 
- bool labelDelimsPresent; // True if a label delimiter is received - - Device* pDevice; // The device currently doing the tracing - Pal::ICmdAllocator* pCmdAllocator; // Command allocator for creating trace-begin/end buffers - Pal::IFence* pBeginFence; // Fence that is signaled when a trace-begin cmdbuf retires - Pal::IFence* pEndSqttFence; // Fence that is signaled when a trace-end cmdbuf retires - Pal::IFence* pEndFence; // Fence that is signaled when a trace-end cmdbuf retires - TraceQueueState* pTracePrepareQueue; // The queue that triggered the full start of a trace - TraceQueueState* pTraceBeginQueue; // The queue that triggered starting SQTT - TraceQueueState* pTraceEndSqttQueue; // The queue that triggered ending SQTT - TraceQueueState* pTraceEndQueue; // The queue that triggered the full end of a trace - - GpuUtil::GpaSession* pGpaSession; // GPA session helper object for building RGP data - uint32_t gpaSampleId; // Sample ID associated with the current trace - bool queueTimingEnabled; // Queue timing is enabled - bool flushAllQueues; // Flushes all queues during the last preparation frame. - - // Queue-specific state/information for tracing: - uint32_t queueCount; - TraceQueueState queueState[MaxTraceQueues]; - uint32_t auxQueueCount; - TraceQueueState auxQueueStates[MaxTraceQueues]; // Used for queues belonging to other logical devices - // pointing to the same physical device - uint32_t queueFamilyCount; - TraceQueueFamilyState queueFamilyState[MaxTraceQueueFamilies]; - - uint32_t activeCmdBufCount; // Number of command buffers in below list - Pal::ICmdBuffer* pActiveCmdBufs[4]; // List of command buffers that need to be reset at end of trace - uint32_t preparedFrameCount; // Number of frames counted while preparing for a trace - uint32_t sqttFrameCount; // Number of frames counted while SQTT tracing is active - uint64_t frameBeginTag; // If a command buffer with this debug-tag is submitted, it is - // treated as a virtual frame-start event. - uint64_t frameEndTag; // Similarly to above but for frame-end post-submit.
- }; - - DevModeMgr(Instance* pInstance); - - Pal::Result Init(); - - void AdvanceActiveTraceStep(TraceState* pState, const Queue* pQueue, bool beginFrame, FrameDelimiterType delimiterType); - void TraceIdleToPendingStep(TraceState* pState); - Pal::Result TracePendingToPreparingStep(TraceState* pState, const Queue* pQueue, FrameDelimiterType delimiterType); - Pal::Result TracePreparingToRunningStep(TraceState* pState, const Queue* pQueue); - Pal::Result TraceRunningToWaitingForSqttStep(TraceState* pState, const Queue* pQueue); - Pal::Result TraceWaitingForSqttToEndingStep(TraceState* pState, const Queue* pQueue); - Pal::Result TraceEndingToIdleStep(TraceState* pState); - void FinishOrAbortTrace(TraceState* pState, bool aborted); - - Pal::Result CheckTraceDeviceChanged(TraceState* pState, Device* pNewDevice); - void DestroyRGPTracing(TraceState* pState); - Pal::Result InitRGPTracing(TraceState* pState, Device* pDevice); - Pal::Result InitTraceQueueResources(TraceState* pState, bool* pHasDebugVmid, const Queue* pQueue, bool auxQueue); - Pal::Result InitTraceQueueResourcesForDevice(TraceState* pState, bool* pHasDebugVmid); - Pal::Result InitTraceQueueFamilyResources(TraceState* pTraceState, TraceQueueFamilyState* pFamilyState); - void DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState); - TraceQueueState* FindTraceQueueState(TraceState* pState, const Queue* pQueue); - bool QueueSupportsTiming(uint32_t deviceIdx, const Queue* pQueue); + uint32_t postSizeLimit) = 0; - Instance* m_pInstance; - DevDriver::DevDriverServer* m_pDevDriverServer; - DevDriver::RGPProtocol::RGPServer* m_pRGPServer; - DevDriver::PipelineUriService* m_pPipelineUriService; - Util::Mutex m_traceMutex; - TraceState m_trace; - bool m_finalized; - TriggerMode m_triggerMode; // Current trigger mode for RGP frame trace - uint32_t m_numPrepFrames; - uint32_t m_traceGpuMemLimit; - bool m_enableInstTracing; // Enable instruction-level SQTT tokens - bool m_enableSampleUpdates; - bool m_allowComputePresents; - bool m_blockingTraceEnd; // Wait on trace-end fences immediately. - uint32_t m_globalFrameIndex; - uint64_t m_traceFrameBeginTag; - uint64_t m_traceFrameEndTag; - uint32_t m_traceFrameBeginIndex; - uint32_t m_traceFrameEndIndex; - uint64_t m_targetApiPsoHash; - uint32_t m_seMask; // Shader engine mask - bool m_perfCountersEnabled; // True if perf counters are enabled - uint64_t m_perfCounterMemLimit; // Memory limit for perf counters - uint32_t m_perfCounterFrequency; // Counter sample frequency - bool m_useStaticVmid; - bool m_staticVmidActive; - bool m_crashAnalysisEnabled; - - using PerfCounterList = Util::Vector; - - PerfCounterList m_perfCounterIds; // List of perf counter ids - - using PipelineCacheList = Util::List; - - PipelineCacheList m_pipelineCaches; - Util::RWLock m_pipelineReinjectionLock; - - PAL_DISALLOW_DEFAULT_CTOR(DevModeMgr); - PAL_DISALLOW_COPY_AND_ASSIGN(DevModeMgr); + virtual void DeregisterPipelineCache( + PipelineBinaryCache* pPipelineCache) = 0; #endif }; -#if ICD_GPUOPEN_DEVMODE_BUILD -// ===================================================================================================================== -// Returns true if queue operations are currently being timed by RGP traces. 
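// A minimal sketch of the interface extraction performed above: the concrete
// DevModeMgr becomes the pure-virtual IDevMode, and per the commit subject
// ("Split DevModeMgr into UberTrace and RGP paths") two concrete classes now
// implement it. The "Sketch" types here are illustrative; the DevModeUberTrace
// name is inferred from the new devmode_ubertrace.h/.cpp files and is an
// assumption.

class IDevModeSketch
{
public:
    virtual ~IDevModeSketch() = default;
    virtual void Destroy() = 0;
    virtual bool IsTracingEnabled() const = 0;
};

class DevModeRgpSketch final : public IDevModeSketch        // RGP path (devmode_rgp.cpp)
{
public:
    virtual void Destroy() override { }
    virtual bool IsTracingEnabled() const override { return false; } // placeholder body
};

class DevModeUberTraceSketch final : public IDevModeSketch  // UberTrace path (devmode_ubertrace.cpp)
{
public:
    virtual void Destroy() override { }
    virtual bool IsTracingEnabled() const override { return false; } // placeholder body
};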
-inline bool DevModeMgr::IsQueueTimingActive( - const Device* pDevice - ) const -{ - return (m_trace.queueTimingEnabled && - (m_trace.status == TraceStatus::Running || - m_trace.status == TraceStatus::Preparing || - m_trace.status == TraceStatus::WaitingForSqtt) && - (pDevice->VkPhysicalDevice(DefaultDeviceIndex) == m_trace.pDevice->VkPhysicalDevice(DefaultDeviceIndex))); } -// ===================================================================================================================== -bool DevModeMgr::GetTraceFrameBeginTag( - uint64_t* pTag - ) const -{ - bool active; - - if (m_trace.status != TraceStatus::Idle) - { - *pTag = m_traceFrameBeginTag; - - active = true; - } - else - { - active = false; - } - - return active; -} - -// ===================================================================================================================== -bool DevModeMgr::GetTraceFrameEndTag( - uint64_t* pTag - ) const -{ - bool active; - - if (m_trace.status != TraceStatus::Idle) - { - *pTag = m_traceFrameEndTag; - - active = true; - } - else - { - active = false; - } - - return active; -} - -#endif -}; - #endif /* __DEVMODE_DEVMODE_MGR_H__ */ diff --git a/icd/api/devmode/devmode_mgr.cpp b/icd/api/devmode/devmode_rgp.cpp similarity index 95% rename from icd/api/devmode/devmode_mgr.cpp rename to icd/api/devmode/devmode_rgp.cpp index ee35ea3b..1006a647 100644 --- a/icd/api/devmode/devmode_mgr.cpp +++ b/icd/api/devmode/devmode_rgp.cpp @@ -24,14 +24,14 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file devmode_mgr.cpp - * @brief Contains implementation of the GPU Open Developer Mode manager + * @file devmode_rgp.cpp + * @brief Contains RGP implementation of the GPU Open Developer Mode manager *********************************************************************************************************************** */ #if ICD_GPUOPEN_DEVMODE_BUILD // Vulkan headers -#include "devmode/devmode_mgr.h" +#include "devmode/devmode_rgp.h" #include "include/vk_cmdbuffer.h" #include "include/vk_instance.h" #include "include/vk_pipeline.h" @@ -69,8 +69,6 @@ namespace vk { -constexpr uint64_t InfiniteTimeout = static_cast<uint64_t>(1e10); - // ===================================================================================================================== // Translates a DevDriver result to a VkResult.
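// The body of DevDriverToVkResult falls outside this hunk's context lines. A
// translation helper of this shape is typical; DevDriverResultSketch below stands
// in for DevDriver::Result, and only NotReady is confirmed elsewhere in this
// patch, so the exact case list is an assumption.

#include <vulkan/vulkan_core.h>

enum class DevDriverResultSketch : int { Success = 0, Error, NotReady }; // stand-in for DevDriver::Result

static VkResult DevDriverToVkResultSketch(DevDriverResultSketch result)
{
    switch (result)
    {
    case DevDriverResultSketch::Success:  return VK_SUCCESS;
    case DevDriverResultSketch::NotReady: return VK_NOT_READY;
    default:                              return VK_ERROR_INITIALIZATION_FAILED; // assumed catch-all
    }
}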
static VkResult DevDriverToVkResult( @@ -136,13 +134,13 @@ static DevDriver::Result GetPipelineHashes( void* pUserData, DevDriver::ExclusionFlags /*flags*/) { - DevModeMgr* pDevModeMgr = static_cast<DevModeMgr*>(pUserData); + DevModeRgp* pDevModeRgp = static_cast<DevModeRgp*>(pUserData); DevDriver::Result result = DevDriver::Result::NotReady; - Util::RWLockAuto cacheListLock(pDevModeMgr->GetPipelineReinjectionLock()); + Util::RWLockAuto cacheListLock(pDevModeRgp->GetPipelineReinjectionLock()); - auto pipelineCacheIter = pDevModeMgr->GetPipelineCacheListIterator(); + auto pipelineCacheIter = pDevModeRgp->GetPipelineCacheListIterator(); while (pipelineCacheIter.Get() != nullptr) { @@ -185,13 +183,13 @@ static DevDriver::Result GetPipelineCodeObjects( const DevDriver::PipelineHash* pPipelineHashes, size_t numHashes) { - DevModeMgr* pDevModeMgr = static_cast<DevModeMgr*>(pUserData); + DevModeRgp* pDevModeRgp = static_cast<DevModeRgp*>(pUserData); DevDriver::Result result = DevDriver::Result::NotReady; - Util::RWLockAuto cacheListLock(pDevModeMgr->GetPipelineReinjectionLock()); + Util::RWLockAuto cacheListLock(pDevModeRgp->GetPipelineReinjectionLock()); - auto pipelineCacheIter = pDevModeMgr->GetPipelineCacheListIterator(); + auto pipelineCacheIter = pDevModeRgp->GetPipelineCacheListIterator(); while (pipelineCacheIter.Get() != nullptr) { @@ -266,16 +264,16 @@ static DevDriver::Result InjectPipelineCodeObjects( void* pUserData, DevDriver::PipelineRecordsIterator& pipelineIter) { - DevModeMgr* pDevModeMgr = static_cast<DevModeMgr*>(pUserData); + DevModeRgp* pDevModeRgp = static_cast<DevModeRgp*>(pUserData); DevDriver::Result result = DevDriver::Result::NotReady; uint32_t replacedCount = 0u; DevDriver::PipelineRecord record; - Util::RWLockAuto cacheListLock(pDevModeMgr->GetPipelineReinjectionLock()); + Util::RWLockAuto cacheListLock(pDevModeRgp->GetPipelineReinjectionLock()); - auto pipelineCacheIter = pDevModeMgr->GetPipelineCacheListIterator(); + auto pipelineCacheIter = pDevModeRgp->GetPipelineCacheListIterator(); while (pipelineCacheIter.Get() != nullptr) { @@ -312,7 +310,7 @@ static DevDriver::Result InjectPipelineCodeObjects( } // ===================================================================================================================== -DevModeMgr::DevModeMgr(Instance* pInstance) +DevModeRgp::DevModeRgp( + Instance* pInstance) : m_pInstance(pInstance), m_pDevDriverServer(pInstance->PalPlatform()->GetDevDriverServer()), @@ -344,24 +343,24 @@ } // ===================================================================================================================== -DevModeMgr::~DevModeMgr() +DevModeRgp::~DevModeRgp() { DestroyRGPTracing(&m_trace); } // ===================================================================================================================== // Creates the GPU Open Developer Mode manager class.
-VkResult DevModeMgr::Create( +VkResult DevModeRgp::Create( Instance* pInstance, - DevModeMgr** ppObject) + DevModeRgp** ppObject) { Pal::Result result = Pal::Result::Success; - void* pStorage = pInstance->AllocMem(sizeof(DevModeMgr), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + void* pStorage = pInstance->AllocMem(sizeof(DevModeRgp), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); if (pStorage != nullptr) { - DevModeMgr* pMgr = VK_PLACEMENT_NEW(pStorage) DevModeMgr(pInstance); + DevModeRgp* pMgr = VK_PLACEMENT_NEW(pStorage) DevModeRgp(pInstance); result = pMgr->Init(); @@ -384,7 +383,7 @@ VkResult DevModeMgr::Create( // ===================================================================================================================== // Initializes the devmode manager based on the current client flags. -Pal::Result DevModeMgr::Init() +Pal::Result DevModeRgp::Init() { Pal::Result result = Pal::Result::Success; @@ -400,7 +399,7 @@ Pal::Result DevModeMgr::Init() // Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit(). // // This finalizes the developer driver manager. -void DevModeMgr::Finalize( +void DevModeRgp::Finalize( uint32_t deviceCount, VulkanSettingsLoader* settingsLoaders[]) { @@ -437,7 +436,7 @@ void DevModeMgr::Finalize( // ===================================================================================================================== // Destroy the developer mode manager -void DevModeMgr::Destroy() +void DevModeRgp::Destroy() { Util::Destructor(this); @@ -446,7 +445,7 @@ void DevModeMgr::Destroy() // ===================================================================================================================== // Waits for the driver to be resumed if it's currently paused. -void DevModeMgr::WaitForDriverResume() +void DevModeRgp::WaitForDriverResume() { auto* pDriverControlServer = m_pDevDriverServer->GetDriverControlServer(); @@ -458,7 +457,7 @@ void DevModeMgr::WaitForDriverResume() // Called to notify of a frame-end boundary and is used to coordinate RGP trace start/stop. // // "delimiterType" represents how the transition/notify was triggered. -void DevModeMgr::NotifyFrameEnd( +void DevModeRgp::NotifyFrameEnd( const Queue* pQueue, FrameDelimiterType delimiterType) { @@ -509,7 +508,7 @@ void DevModeMgr::NotifyFrameEnd( } // ===================================================================================================================== -void DevModeMgr::AdvanceActiveTraceStep( +void DevModeRgp::AdvanceActiveTraceStep( TraceState* pState, const Queue* pQueue, bool beginFrame, @@ -588,7 +587,7 @@ void DevModeMgr::AdvanceActiveTraceStep( // Checks if all trace results are ready and finalizes the results, transmitting data through gpuopen. // // Transitions from Ending to Idle step. -Pal::Result DevModeMgr::TraceEndingToIdleStep(TraceState* pState) +Pal::Result DevModeRgp::TraceEndingToIdleStep(TraceState* pState) { VK_ASSERT(pState->status == TraceStatus::Ending); @@ -596,7 +595,8 @@ Pal::Result DevModeMgr::TraceEndingToIdleStep(TraceState* pState) if (m_blockingTraceEnd) { - result = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences(1, &pState->pEndFence, true, InfiniteTimeout); + result = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences( + 1, &pState->pEndFence, true, std::chrono::nanoseconds::max()); if (result != Pal::Result::Success) { @@ -677,7 +677,7 @@ Pal::Result DevModeMgr::TraceEndingToIdleStep(TraceState* pState) // Notifies of a frame-begin boundary and is used to coordinate RGP trace start/stop. 
// // "delimiterType" represents how the transition/notify was triggered. -void DevModeMgr::NotifyFrameBegin( +void DevModeRgp::NotifyFrameBegin( const Queue* pQueue, FrameDelimiterType delimiterType) { @@ -720,7 +720,7 @@ void DevModeMgr::NotifyFrameBegin( // ===================================================================================================================== // Returns the queue state for this aparticular queue. -DevModeMgr::TraceQueueState* DevModeMgr::FindTraceQueueState( +DevModeRgp::TraceQueueState* DevModeRgp::FindTraceQueueState( TraceState* pState, const Queue* pQueue) { @@ -758,7 +758,7 @@ DevModeMgr::TraceQueueState* DevModeMgr::FindTraceQueueState( // ===================================================================================================================== // Called from tracing layer before any queue submits any work. -void DevModeMgr::NotifyPreSubmit() +void DevModeRgp::NotifyPreSubmit() { // Check for pending traces here. TraceIdleToPendingStep(&m_trace); @@ -769,7 +769,7 @@ void DevModeMgr::NotifyPreSubmit() // each command buffer submit by the tracing layer and should be very light-weight. // // This function moves the trace state from Idle to Pending. -void DevModeMgr::TraceIdleToPendingStep( +void DevModeRgp::TraceIdleToPendingStep( TraceState* pState) { // Double-checked lock to test if there is a trace pending. If so, extract its trace parameters. @@ -910,7 +910,7 @@ void DevModeMgr::TraceIdleToPendingStep( // "delimiterType" represents how the transition/notify was triggered. // // This function transitions from the Pending state to the Preparing state. -Pal::Result DevModeMgr::TracePendingToPreparingStep( +Pal::Result DevModeRgp::TracePendingToPreparingStep( TraceState* pState, const Queue* pQueue, FrameDelimiterType delimiterType) @@ -1208,7 +1208,7 @@ Pal::Result DevModeMgr::TracePendingToPreparingStep( // information command buffer which starts SQ thread tracing (SQTT). // // This function transitions from the Preparing state to the Running state. -Pal::Result DevModeMgr::TracePreparingToRunningStep( +Pal::Result DevModeRgp::TracePreparingToRunningStep( TraceState* pState, const Queue* pQueue) { @@ -1336,7 +1336,7 @@ Pal::Result DevModeMgr::TracePreparingToRunningStep( // This function submits the command buffer to stop SQTT tracing. Full tracing still continues. // // This function transitions from the Running state to the WaitingForSqtt state. -Pal::Result DevModeMgr::TraceRunningToWaitingForSqttStep( +Pal::Result DevModeRgp::TraceRunningToWaitingForSqttStep( TraceState* pState, const Queue* pQueue) { @@ -1446,7 +1446,7 @@ Pal::Result DevModeMgr::TraceRunningToWaitingForSqttStep( // This function ends a running RGP trace. // // This function transitions from the WaitingForSqtt state to WaitingForResults state. 
-Pal::Result DevModeMgr::TraceWaitingForSqttToEndingStep( +Pal::Result DevModeRgp::TraceWaitingForSqttToEndingStep( TraceState* pState, const Queue* pQueue) { @@ -1457,7 +1457,8 @@ Pal::Result DevModeMgr::TraceWaitingForSqttToEndingStep( if (fenceResult == Pal::Result::NotReady && m_blockingTraceEnd) { - fenceResult = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences(1, &pState->pEndSqttFence, true, InfiniteTimeout); + fenceResult = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences( + 1, &pState->pEndSqttFence, true, std::chrono::nanoseconds::max()); } // Return without advancing if not ready yet or submit failed @@ -1556,7 +1557,7 @@ Pal::Result DevModeMgr::TraceWaitingForSqttToEndingStep( // ===================================================================================================================== // This function resets and possibly cancels a currently active (between begin/end) RGP trace. It frees any dependent // resources. -void DevModeMgr::FinishOrAbortTrace( +void DevModeRgp::FinishOrAbortTrace( TraceState* pState, bool aborted) { @@ -1600,7 +1601,7 @@ void DevModeMgr::FinishOrAbortTrace( // ===================================================================================================================== // This function will reinitialize RGP tracing resources that are reused between traces if the new trace device // has changed since the last trace. -Pal::Result DevModeMgr::CheckTraceDeviceChanged( +Pal::Result DevModeRgp::CheckTraceDeviceChanged( TraceState* pState, Device* pNewDevice) { @@ -1630,7 +1631,8 @@ Pal::Result DevModeMgr::CheckTraceDeviceChanged( // ===================================================================================================================== // Destroys device-persistent RGP resources for a particular queue family -void DevModeMgr::DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState) +void DevModeRgp::DestroyTraceQueueFamilyResources( + TraceQueueFamilyState* pState) { if (pState->pTraceBeginCmdBuf != nullptr) { @@ -1670,7 +1672,8 @@ void DevModeMgr::DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState) // ===================================================================================================================== // Destroys device-persistent RGP resources -void DevModeMgr::DestroyRGPTracing(TraceState* pState) +void DevModeRgp::DestroyRGPTracing( + TraceState* pState) { if (pState->status != TraceStatus::Idle) { @@ -1725,7 +1728,7 @@ void DevModeMgr::DestroyRGPTracing(TraceState* pState) // // If "auxQueue" is true, then the queue provided does not belong to the tracing logical device, but belongs to the // same physical device (and thus, the same PAL device) -Pal::Result DevModeMgr::InitTraceQueueResources( +Pal::Result DevModeRgp::InitTraceQueueResources( TraceState* pState, bool* pHasDebugVmid, const Queue* pQueue, @@ -1831,7 +1834,7 @@ Pal::Result DevModeMgr::InitTraceQueueResources( // ===================================================================================================================== // This function finds out all the queues in the device that we have to synchronize for RGP-traced frames and // initializes resources for them. 
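// Several hunks in this patch (WaitForFences, WaitForSemaphores, event.Wait)
// migrate PAL wait timeouts from raw 64-bit nanosecond counts -- ~0ULL, or the
// InfiniteTimeout constant deleted above -- to std::chrono types, matching the
// PAL 867 interface bump. A minimal before/after illustration; WaitSketch is a
// stand-in, not a PAL entry point.

#include <chrono>
#include <cstdint>

static void WaitSketchOld(uint64_t timeoutNs) { (void)timeoutNs; }          // unit lives only in the name
static void WaitSketch(std::chrono::nanoseconds timeout) { (void)timeout; } // unit lives in the type

static void DemoTimeouts()
{
    WaitSketchOld(~0ULL);                        // before: magic "wait forever" integer
    WaitSketch(std::chrono::nanoseconds::max()); // after: typed, explicit "wait forever"
    WaitSketch(std::chrono::milliseconds{ 1 });  // coarser units convert implicitly without loss
}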
-Pal::Result DevModeMgr::InitTraceQueueResourcesForDevice( +Pal::Result DevModeRgp::InitTraceQueueResourcesForDevice( TraceState* pState, bool* pHasDebugVmid) { @@ -1868,7 +1871,7 @@ Pal::Result DevModeMgr::InitTraceQueueResourcesForDevice( // ===================================================================================================================== // This function initializes the queue family -specific resources to support RGP tracing for a particular queue family -Pal::Result DevModeMgr::InitTraceQueueFamilyResources( +Pal::Result DevModeRgp::InitTraceQueueFamilyResources( TraceState* pTraceState, TraceQueueFamilyState* pFamilyState) { @@ -2037,7 +2040,7 @@ Pal::Result DevModeMgr::InitTraceQueueFamilyResources( // ===================================================================================================================== // Initializes device-persistent RGP resources -Pal::Result DevModeMgr::InitRGPTracing( +Pal::Result DevModeRgp::InitRGPTracing( TraceState* pState, Device* pDevice) { @@ -2250,7 +2253,7 @@ Pal::Result DevModeMgr::InitRGPTracing( // ===================================================================================================================== // Called when a new device is created. This will preallocate reusable RGP trace resources for that device. -void DevModeMgr::PostDeviceCreate(Device* pDevice) +void DevModeRgp::PostDeviceCreate(Device* pDevice) { Util::MutexAuto lock(&m_traceMutex); @@ -2273,7 +2276,8 @@ void DevModeMgr::PostDeviceCreate(Device* pDevice) // ===================================================================================================================== // Called prior to a device's being destroyed. This will free persistent RGP trace resources for that device. -void DevModeMgr::PreDeviceDestroy(Device* pDevice) +void DevModeRgp::PreDeviceDestroy( + Device* pDevice) { Util::MutexAuto lock(&m_traceMutex); @@ -2285,7 +2289,7 @@ void DevModeMgr::PreDeviceDestroy(Device* pDevice) } // ===================================================================================================================== -bool DevModeMgr::QueueSupportsTiming( +bool DevModeRgp::QueueSupportsTiming( uint32_t deviceIdx, const Queue* pQueue) { @@ -2311,7 +2315,7 @@ bool DevModeMgr::QueueSupportsTiming( } // ===================================================================================================================== -Pal::Result DevModeMgr::TimedSignalQueueSemaphore( +Pal::Result DevModeRgp::TimedSignalQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, @@ -2341,7 +2345,7 @@ Pal::Result DevModeMgr::TimedSignalQueueSemaphore( } // ===================================================================================================================== -Pal::Result DevModeMgr::TimedWaitQueueSemaphore( +Pal::Result DevModeRgp::TimedWaitQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, @@ -2371,7 +2375,7 @@ Pal::Result DevModeMgr::TimedWaitQueueSemaphore( } // ===================================================================================================================== -bool DevModeMgr::IsTracingEnabled() const +bool DevModeRgp::IsTracingEnabled() const { VK_ASSERT(m_finalized); @@ -2386,7 +2390,7 @@ bool DevModeMgr::IsTracingEnabled() const } // ===================================================================================================================== -Pal::Result DevModeMgr::TimedQueueSubmit( +Pal::Result DevModeRgp::TimedQueueSubmit( uint32_t deviceIdx, Queue* pQueue, uint32_t 
cmdBufferCount, @@ -2467,7 +2471,7 @@ Pal::Result DevModeMgr::TimedQueueSubmit( // ===================================================================================================================== // Registers this pipeline, storing the code object binary and recording a load event in the RGP trace. -void DevModeMgr::PipelineCreated( +void DevModeRgp::PipelineCreated( Device* pDevice, Pipeline* pPipeline) { @@ -2510,7 +2514,7 @@ void DevModeMgr::PipelineCreated( // ===================================================================================================================== // Unregisters this pipeline, recording an unload event in the RGP trace. -void DevModeMgr::PipelineDestroyed( +void DevModeRgp::PipelineDestroyed( Device* pDevice, Pipeline* pPipeline) { @@ -2555,7 +2559,7 @@ void DevModeMgr::PipelineDestroyed( // ===================================================================================================================== // Registers the shader libraries under this pipeline so the contents of each library can be written into the RGP // trace file. -void DevModeMgr::ShaderLibrariesCreated( +void DevModeRgp::ShaderLibrariesCreated( Device* pDevice, RayTracingPipeline* pPipeline) { @@ -2573,7 +2577,7 @@ void DevModeMgr::ShaderLibrariesCreated( // ===================================================================================================================== // Unregisters the shader libraries under this pipeline, recording an unload event in the RGP trace. -void DevModeMgr::ShaderLibrariesDestroyed( +void DevModeRgp::ShaderLibrariesDestroyed( Device* pDevice, RayTracingPipeline* pPipeline) { @@ -2591,7 +2595,7 @@ void DevModeMgr::ShaderLibrariesDestroyed( // ===================================================================================================================== // Retrieves the target API PSO hash from the RGP Server -uint64_t DevModeMgr::GetInstructionTraceTargetHash() +uint64_t DevModeRgp::GetInstructionTraceTargetHash() { uint64_t targetHash = InvalidTargetPipelineHash; @@ -2610,7 +2614,7 @@ uint64_t DevModeMgr::GetInstructionTraceTargetHash() // ===================================================================================================================== // Starts instruction trace -void DevModeMgr::StartInstructionTrace( +void DevModeRgp::StartInstructionTrace( CmdBuffer* pCmdBuffer) { if (IsTracingEnabled()) @@ -2624,7 +2628,7 @@ void DevModeMgr::StartInstructionTrace( // ===================================================================================================================== // Stops instruction trace -void DevModeMgr::StopInstructionTrace( +void DevModeRgp::StopInstructionTrace( CmdBuffer* pCmdBuffer) { if (IsTracingEnabled()) @@ -2639,7 +2643,7 @@ void DevModeMgr::StopInstructionTrace( // ===================================================================================================================== // Registers a pipeline binary cache object with the pipeline URI service and initializes the pipeline URI service // the first time a pipeline binary cache object is registered -Util::Result DevModeMgr::RegisterPipelineCache( +Util::Result DevModeRgp::RegisterPipelineCache( PipelineBinaryCache* pPipelineCache, uint32_t postSizeLimit) { @@ -2693,7 +2697,7 @@ Util::Result DevModeMgr::RegisterPipelineCache( // ===================================================================================================================== // Deregisters a pipeline binary cache with the pipeline URI service -void 
DevModeMgr::DeregisterPipelineCache( +void DevModeRgp::DeregisterPipelineCache( PipelineBinaryCache* pPipelineCache) { Util::RWLockAuto readWriteLock(&m_pipelineReinjectionLock); @@ -2718,6 +2722,60 @@ void DevModeMgr::DeregisterPipelineCache( } } +// ===================================================================================================================== +bool DevModeRgp::IsQueueTimingActive( + const Device* pDevice + ) const +{ + return (m_trace.queueTimingEnabled && + (m_trace.status == TraceStatus::Running || + m_trace.status == TraceStatus::Preparing || + m_trace.status == TraceStatus::WaitingForSqtt) && + (pDevice->VkPhysicalDevice(DefaultDeviceIndex) == m_trace.pDevice->VkPhysicalDevice(DefaultDeviceIndex))); +} + +// ===================================================================================================================== +bool DevModeRgp::GetTraceFrameBeginTag( + uint64_t* pTag + ) const +{ + bool active; + + if (m_trace.status != TraceStatus::Idle) + { + *pTag = m_traceFrameBeginTag; + + active = true; + } + else + { + active = false; + } + + return active; +} + +// ===================================================================================================================== +bool DevModeRgp::GetTraceFrameEndTag( + uint64_t* pTag + ) const +{ + bool active; + + if (m_trace.status != TraceStatus::Idle) + { + *pTag = m_traceFrameEndTag; + + active = true; + } + else + { + active = false; + } + + return active; +} + }; // namespace vk #endif diff --git a/icd/api/devmode/devmode_rgp.h b/icd/api/devmode/devmode_rgp.h new file mode 100644 index 00000000..d24eb82e --- /dev/null +++ b/icd/api/devmode/devmode_rgp.h @@ -0,0 +1,320 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file devmode_rgp.h +* @brief Contains the RGP implementation of the GPU Open Developer Mode (DevModeRgp) +*********************************************************************************************************************** +*/ + +#ifndef __DEVMODE_DEVMODE_RGP_H__ +#define __DEVMODE_DEVMODE_RGP_H__ + +#pragma once + +#include "devmode/devmode_mgr.h" + +// PAL headers +#include "palVector.h" + +// gpuutil headers +#include "gpuUtil/palGpaSession.h" +#if ICD_GPUOPEN_DEVMODE_BUILD +// gpuopen headers +#include "gpuopen.h" +#endif + +// PAL forward declarations +namespace Pal +{ +class ICmdBuffer; +class IFence; +class IQueueSemaphore; +struct PalPublicSettings; +} + +// DevDriver forward declarations +namespace DevDriver +{ +class DevDriverServer; +class PipelineUriService; +namespace RGPProtocol +{ +class RGPServer; +} +} + +namespace vk +{ + +// ===================================================================================================================== +// This class provides functionality to interact with the GPU Open Developer Mode message passing service and the rest +// of the driver. +class DevModeRgp final : public IDevMode +{ +#if ICD_GPUOPEN_DEVMODE_BUILD +public: + // Number of frames to wait before collecting a hardware trace. + // Note: This will be replaced in the future by a remotely configurable value provided by the RGP server. + static constexpr uint32_t NumTracePreparationFrames = 4; + + ~DevModeRgp(); + + static VkResult Create(Instance* pInstance, DevModeRgp** ppObject); + + virtual void Finalize( + uint32_t deviceCount, + VulkanSettingsLoader* settingsLoaders[]) override; + + virtual void Destroy() override; + + virtual void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void WaitForDriverResume() override; + virtual void PipelineCreated(Device* pDevice, Pipeline* pPipeline) override; + virtual void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline) override; +#if VKI_RAY_TRACING + virtual void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline) override; + virtual void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline) override; +#endif + virtual void PostDeviceCreate(Device* pDevice) override; + virtual void PreDeviceDestroy(Device* pDevice) override; + virtual void NotifyPreSubmit() override; + + virtual uint64_t GetInstructionTraceTargetHash() override; + virtual void StartInstructionTrace(CmdBuffer* pCmdBuffer) override; + virtual void StopInstructionTrace(CmdBuffer* pCmdBuffer) override; + + virtual bool IsTracingEnabled() const override; + virtual bool IsCrashAnalysisEnabled() const override { return m_crashAnalysisEnabled; } + + virtual Pal::Result TimedQueueSubmit( + uint32_t deviceIdx, + Queue* pQueue, + uint32_t cmdBufferCount, + const VkCommandBuffer* pCommandBuffers, + const Pal::SubmitInfo& submitInfo, + VirtualStackFrame* pVirtStackFrame) override; + + virtual Pal::Result TimedSignalQueueSemaphore( + uint32_t deviceIdx, + Queue* pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + virtual Pal::Result TimedWaitQueueSemaphore( + uint32_t deviceIdx, + Queue* 
pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + virtual bool IsQueueTimingActive(const Device* pDevice) const override; + virtual bool GetTraceFrameBeginTag(uint64_t* pTag) const override; + virtual bool GetTraceFrameEndTag(uint64_t* pTag) const override; + + virtual Util::Result RegisterPipelineCache( + PipelineBinaryCache* pPipelineCache, + uint32_t postSizeLimit) override; + + virtual void DeregisterPipelineCache( + PipelineBinaryCache* pPipelineCache) override; + + Util::ListIterator GetPipelineCacheListIterator() + { return m_pipelineCaches.Begin(); } + + Util::RWLock* GetPipelineReinjectionLock() + { return &m_pipelineReinjectionLock; } + +private: + static constexpr uint32_t MaxTraceQueueFamilies = Queue::MaxQueueFamilies; + static constexpr uint32_t MaxTraceQueues = MaxTraceQueueFamilies * Queue::MaxQueuesPerFamily; + + // Various trigger modes supported for RGP traces + enum class TriggerMode : uint32_t + { + Present = 0, // Traces triggered by presents + Index, // Traces triggered by frame indices + Tag // Traces triggered by command buffer tags + }; + + // Steps that an RGP trace goes through + enum class TraceStatus : uint32_t + { + // "Pre-trace" stages: + Idle = 0, // No active trace and none requested + Pending, // We've identified that a trace has been requested and we've received its parameters, + // but we have not yet seen the first frame. + Preparing, // A trace has been requested but is not active yet because we are + // currently sampling timing information over some number of lead frames. + Running, // SQTT and queue timing is currently active for all command buffer submits. + + // "Post-trace" stages: + WaitingForSqtt, // Command to turn off SQTT has been submitted and we're waiting for fence confirmation. + Ending // Tracing is no longer active, but all results are not yet ready. + }; + + // Queue family (type)-specific state to support RGP tracing (part of device state) + struct TraceQueueFamilyState + { + uint32_t queueFamilyIndex; + Pal::QueueType queueType; + Pal::EngineType engineType; + Pal::ICmdBuffer* pTraceBeginCmdBuf; + Pal::ICmdBuffer* pTraceBeginSqttCmdBuf; + Pal::ICmdBuffer* pTraceEndSqttCmdBuf; + Pal::ICmdBuffer* pTraceEndCmdBuf; + Pal::ICmdBuffer* pTraceFlushCmdBuf; + bool supportsTracing; + bool usedForBegin; + bool usedForEndSqtt; + bool usedForEnd; + }; + + // Queue-specific resources to support RGP tracing (part of device state) + struct TraceQueueState + { + const Queue* pQueue; + TraceQueueFamilyState* pFamily; + Pal::uint64 queueId; + Pal::uint64 queueContext; + bool timingSupported; + }; + + // All per-device state to support RGP tracing + struct TraceState + { + TraceStatus status; // Current trace status (idle, running, etc.) 
+        bool                  labelDelimsPresent;      // True if a label delimiter has been received
+
+        Device*               pDevice;                 // The device currently doing the tracing
+        Pal::ICmdAllocator*   pCmdAllocator;           // Command allocator for creating trace-begin/end buffers
+        Pal::IFence*          pBeginFence;             // Fence that is signaled when a trace-begin cmdbuf retires
+        Pal::IFence*          pEndSqttFence;           // Fence that is signaled when an end-SQTT cmdbuf retires
+        Pal::IFence*          pEndFence;               // Fence that is signaled when a trace-end cmdbuf retires
+        TraceQueueState*      pTracePrepareQueue;      // The queue that triggered the full start of a trace
+        TraceQueueState*      pTraceBeginQueue;        // The queue that triggered starting SQTT
+        TraceQueueState*      pTraceEndSqttQueue;      // The queue that triggered ending SQTT
+        TraceQueueState*      pTraceEndQueue;          // The queue that triggered the full end of a trace
+
+        GpuUtil::GpaSession*  pGpaSession;             // GPA session helper object for building RGP data
+        uint32_t              gpaSampleId;             // Sample ID associated with the current trace
+        bool                  queueTimingEnabled;      // Queue timing is enabled
+        bool                  flushAllQueues;          // Flushes all queues during the last preparation frame.
+
+        // Queue-specific state/information for tracing:
+        uint32_t              queueCount;
+        TraceQueueState       queueState[MaxTraceQueues];
+        uint32_t              auxQueueCount;
+        TraceQueueState       auxQueueStates[MaxTraceQueues];  // Used for queues belonging to other logical devices
+                                                               // pointing to the same physical device
+        uint32_t              queueFamilyCount;
+        TraceQueueFamilyState queueFamilyState[MaxTraceQueueFamilies];
+
+        uint32_t              activeCmdBufCount;       // Number of command buffers in below list
+        Pal::ICmdBuffer*      pActiveCmdBufs[4];       // List of command buffers that need to be reset at end of trace
+        uint32_t              preparedFrameCount;      // Number of frames counted while preparing for a trace
+        uint32_t              sqttFrameCount;          // Number of frames counted while SQTT tracing is active
+        uint64_t              frameBeginTag;           // If a command buffer with this debug-tag is submitted, it is
+                                                       // treated as a virtual frame-start event.
+        uint64_t              frameEndTag;             // Similar to above, but treated as a virtual frame-end event
+                                                       // after the submit.
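+
+        // Note: the two tags above are what enable TriggerMode::Tag, where specially tagged command buffers,
+        // rather than presents, delimit the virtual frame being traced.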
+ }; + + DevModeRgp(Instance* pInstance); + + Pal::Result Init(); + + Pal::Result CheckTraceDeviceChanged(TraceState* pState, Device* pNewDevice); + + Pal::Result InitRGPTracing(TraceState* pState, Device* pDevice); + void DestroyRGPTracing(TraceState* pState); + + Pal::Result InitTraceQueueResources(TraceState* pState, bool* pHasDebugVmid, const Queue* pQueue, bool auxQueue); + Pal::Result InitTraceQueueResourcesForDevice(TraceState* pState, bool* pHasDebugVmid); + Pal::Result InitTraceQueueFamilyResources(TraceState* pTraceState, TraceQueueFamilyState* pFamilyState); + void DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState); + TraceQueueState* FindTraceQueueState(TraceState* pState, const Queue* pQueue); + bool QueueSupportsTiming(uint32_t deviceIdx, const Queue* pQueue); + + // RGP trace state functionality + void AdvanceActiveTraceStep( + TraceState* pState, + const Queue* pQueue, + bool beginFrame, + FrameDelimiterType delimiterType); + void TraceIdleToPendingStep(TraceState* pState); + Pal::Result TracePendingToPreparingStep( + TraceState* pState, + const Queue* pQueue, + FrameDelimiterType delimiterType); + Pal::Result TracePreparingToRunningStep(TraceState* pState, const Queue* pQueue); + Pal::Result TraceRunningToWaitingForSqttStep(TraceState* pState, const Queue* pQueue); + Pal::Result TraceWaitingForSqttToEndingStep(TraceState* pState, const Queue* pQueue); + Pal::Result TraceEndingToIdleStep(TraceState* pState); + void FinishOrAbortTrace(TraceState* pState, bool aborted); + + Instance* m_pInstance; + DevDriver::DevDriverServer* m_pDevDriverServer; + DevDriver::RGPProtocol::RGPServer* m_pRGPServer; + DevDriver::PipelineUriService* m_pPipelineUriService; + Util::Mutex m_traceMutex; + TraceState m_trace; + bool m_finalized; + TriggerMode m_triggerMode; // Current trigger mode for RGP frame trace + uint32_t m_numPrepFrames; + uint32_t m_traceGpuMemLimit; + bool m_enableInstTracing; // Enable instruction-level SQTT tokens + bool m_enableSampleUpdates; + bool m_allowComputePresents; + bool m_blockingTraceEnd; // Wait on trace-end fences immediately. + uint32_t m_globalFrameIndex; + uint64_t m_traceFrameBeginTag; + uint64_t m_traceFrameEndTag; + uint32_t m_traceFrameBeginIndex; + uint32_t m_traceFrameEndIndex; + uint64_t m_targetApiPsoHash; + uint32_t m_seMask; // Shader engine mask + bool m_perfCountersEnabled; // True if perf counters are enabled + uint64_t m_perfCounterMemLimit; // Memory limit for perf counters + uint32_t m_perfCounterFrequency; // Counter sample frequency + bool m_useStaticVmid; + bool m_staticVmidActive; + bool m_crashAnalysisEnabled; + + using PerfCounterList = Util::Vector; + + PerfCounterList m_perfCounterIds; // List of perf counter ids + + using PipelineCacheList = Util::List; + + PipelineCacheList m_pipelineCaches; + Util::RWLock m_pipelineReinjectionLock; +#endif +}; + +} + +#endif /* __DEVMODE_DEVMODE_RGP_H__ */ diff --git a/icd/api/devmode/devmode_ubertrace.cpp b/icd/api/devmode/devmode_ubertrace.cpp new file mode 100644 index 00000000..0c55ce31 --- /dev/null +++ b/icd/api/devmode/devmode_ubertrace.cpp @@ -0,0 +1,577 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file devmode_ubertrace.cpp + * @brief Contains UberTrace implementation of the GPU Open Developer Mode manager + *********************************************************************************************************************** + */ + +#if ICD_GPUOPEN_DEVMODE_BUILD +// Vulkan headers +#include "devmode/devmode_ubertrace.h" +#include "include/vk_cmdbuffer.h" +#include "include/vk_instance.h" +#include "include/vk_pipeline.h" +#include "include/vk_graphics_pipeline.h" +#include "include/vk_graphics_pipeline_library.h" +#include "include/vk_physical_device.h" +#include "include/vk_utils.h" +#include "include/vk_conv.h" +#include "include/pipeline_binary_cache.h" +#include "sqtt/sqtt_layer.h" +#include "sqtt/sqtt_mgr.h" + +// PAL headers +#include "pal.h" +#include "palCodeObjectTraceSource.h" +#include "palQueueTimingsTraceSource.h" + +// gpuopen headers +#include "devDriverServer.h" +#include "msgChannel.h" +#include "msgTransport.h" +#include "protocols/driverControlServer.h" +#include "protocols/ddPipelineUriService.h" +#include "protocols/ddEventServer.h" + +#if VKI_RAY_TRACING +#include "raytrace/vk_ray_tracing_pipeline.h" +#endif + +namespace vk +{ + +// ===================================================================================================================== +DevModeUberTrace::DevModeUberTrace( + Instance* pInstance) + : + m_pInstance(pInstance), + m_pDevDriverServer(pInstance->PalPlatform()->GetDevDriverServer()), + m_finalized(false), + m_crashAnalysisEnabled(false), + m_globalFrameIndex(1), // Must start from 1 according to RGP spec + m_pTraceSession(pInstance->PalPlatform()->GetTraceSession()), + m_pCodeObjectTraceSource(nullptr), + m_pQueueTimingsTraceSource(nullptr) +{ +} + +// ===================================================================================================================== +DevModeUberTrace::~DevModeUberTrace() +{ + DestroyUberTraceResources(); +} + +// ===================================================================================================================== +// Creates the UberTrace GPU Open Developer Mode manager class. 
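+// The object is placement-constructed in memory allocated from the Instance; Destroy() runs the destructor and
+// releases that same allocation through Instance::FreeMem().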
+VkResult DevModeUberTrace::Create( + Instance* pInstance, + DevModeUberTrace** ppObject) +{ + Pal::Result result = Pal::Result::Success; + + void* pStorage = pInstance->AllocMem(sizeof(DevModeUberTrace), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pStorage != nullptr) + { + DevModeUberTrace* pMgr = VK_PLACEMENT_NEW(pStorage) DevModeUberTrace(pInstance); + + if (result == Pal::Result::Success) + { + *ppObject = pMgr; + } + else + { + pMgr->Destroy(); + } + } + else + { + result = Pal::Result::ErrorOutOfMemory; + } + + return PalToVkResult(result); +} + +// ===================================================================================================================== +void DevModeUberTrace::Finalize( + uint32_t deviceCount, + VulkanSettingsLoader* settingsLoaders[]) +{ + m_pDevDriverServer->GetDriverControlServer()->StartLateDeviceInit(); + + // Finalize the devmode manager + m_pDevDriverServer->Finalize(); + + m_crashAnalysisEnabled = m_pInstance->PalPlatform()->IsCrashAnalysisModeEnabled(); + + m_finalized = true; +} + +// ===================================================================================================================== +void DevModeUberTrace::Destroy() +{ + Util::Destructor(this); + m_pInstance->FreeMem(this); +} + +// ===================================================================================================================== +void DevModeUberTrace::NotifyFrameBegin( + const Queue* pQueue, + FrameDelimiterType delimiterType) +{ + // Wait for the driver to be resumed in case it's been paused. + WaitForDriverResume(); + + m_pInstance->PalPlatform()->UpdateFrameTraceController(pQueue->PalQueue(DefaultDeviceIndex)); +} + +// ===================================================================================================================== +void DevModeUberTrace::NotifyFrameEnd( + const Queue* pQueue, + FrameDelimiterType delimiterType) +{ + if (IsQueueTimingActive(pQueue->VkDevice())) + { + // Call TimedQueuePresent() to insert commands that collect GPU timestamp. + Pal::IQueue* pPalQueue = pQueue->PalQueue(DefaultDeviceIndex); + + // Currently nothing in the PresentInfo struct is used for inserting a timed present marker. + GpuUtil::TimedQueuePresentInfo timedPresentInfo = {}; + Pal::Result result = m_pQueueTimingsTraceSource->TimedQueuePresent(pPalQueue, timedPresentInfo); + + VK_ASSERT(result == Pal::Result::Success); + } + + m_globalFrameIndex++; +} + +// ===================================================================================================================== +// Waits for the driver to be resumed if it's currently paused. 
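+// The wait is implemented by ticking the DriverControl server: DriverTick() does not return until any pending
+// pause request has been serviced.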
+void DevModeUberTrace::WaitForDriverResume()
+{
+    auto* pDriverControlServer = m_pDevDriverServer->GetDriverControlServer();
+
+    VK_ASSERT(pDriverControlServer != nullptr);
+    pDriverControlServer->DriverTick();
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::PipelineCreated(
+    Device*   pDevice,
+    Pipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        GpuUtil::RegisterPipelineInfo pipelineInfo = { 0 };
+        pipelineInfo.apiPsoHash = pPipeline->GetApiHash();
+        if (pPipeline->PalPipeline(DefaultDeviceIndex) != nullptr)
+        {
+            bool isGplPipeline = false;
+            GraphicsPipeline* pGraphicsPipeline = nullptr;
+            if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_GRAPHICS)
+            {
+                pGraphicsPipeline = reinterpret_cast<GraphicsPipeline*>(pPipeline);
+                isGplPipeline = pGraphicsPipeline->GetPalShaderLibrary(GraphicsLibraryPreRaster) != nullptr;
+            }
+
+            if (isGplPipeline)
+            {
+                GpuUtil::RegisterLibraryInfo libInfo = { pipelineInfo.apiPsoHash };
+                for (uint32_t i = 0; i < GraphicsLibraryCount; i++)
+                {
+                    const Pal::IShaderLibrary* pLib =
+                        pGraphicsPipeline->GetPalShaderLibrary(static_cast<GraphicsLibraryType>(i));
+                    if (pLib != nullptr)
+                    {
+                        m_pCodeObjectTraceSource->RegisterLibrary(pLib, libInfo);
+                    }
+                }
+            }
+            else
+            {
+                m_pCodeObjectTraceSource->RegisterPipeline(pPipeline->PalPipeline(DefaultDeviceIndex), pipelineInfo);
+            }
+        }
+    }
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::PipelineDestroyed(
+    Device*   pDevice,
+    Pipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        if (pPipeline->PalPipeline(DefaultDeviceIndex) != nullptr)
+        {
+            bool isGplPipeline = false;
+            if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_GRAPHICS)
+            {
+                GraphicsPipeline* pGraphicsPipeline = reinterpret_cast<GraphicsPipeline*>(pPipeline);
+                isGplPipeline = pGraphicsPipeline->GetPalShaderLibrary(GraphicsLibraryPreRaster) != nullptr;
+            }
+
+            if (isGplPipeline == false)
+            {
+                m_pCodeObjectTraceSource->UnregisterPipeline(pPipeline->PalPipeline(DefaultDeviceIndex));
+            }
+        }
+        else
+        {
+            if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_GRAPHICS)
+            {
+                GraphicsPipelineLibrary* pGraphicsLibrary = reinterpret_cast<GraphicsPipelineLibrary*>(pPipeline);
+                const Pal::IShaderLibrary* pPalLibraries[GraphicsLibraryCount] = {};
+                pGraphicsLibrary->GetOwnedPalShaderLibraries(pPalLibraries);
+                for (uint32_t i = 0; i < GraphicsLibraryCount; i++)
+                {
+                    if (pPalLibraries[i] != nullptr)
+                    {
+                        m_pCodeObjectTraceSource->UnregisterLibrary(pPalLibraries[i]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+#if VKI_RAY_TRACING
+// =====================================================================================================================
+void DevModeUberTrace::ShaderLibrariesCreated(
+    Device*             pDevice,
+    RayTracingPipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        for (uint32_t i = 0; i < pPipeline->GetShaderLibraryCount(); ++i)
+        {
+            GpuUtil::RegisterLibraryInfo pipelineInfo = { pPipeline->GetApiHash() };
+            m_pCodeObjectTraceSource->RegisterLibrary(pPipeline->PalShaderLibrary(i), pipelineInfo);
+        }
+    }
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::ShaderLibrariesDestroyed(
+    Device*             pDevice,
+    RayTracingPipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        for (uint32_t i = 0; i < pPipeline->GetShaderLibraryCount(); ++i)
+        {
+            m_pCodeObjectTraceSource->UnregisterLibrary(pPipeline->PalShaderLibrary(i));
+        }
+    }
+}
+#endif
+
+// =====================================================================================================================
+Pal::Result DevModeUberTrace::RegisterQueuesForDevice(
+    Device* pDevice)
+{
+    Pal::Result result = Pal::Result::Success;
+
+    for (uint32_t familyIdx = 0; familyIdx < Queue::MaxQueueFamilies; ++familyIdx)
+    {
+        for (uint32_t queueIdx = 0;
+             (queueIdx < Queue::MaxQueuesPerFamily) && (result == Pal::Result::Success);
+             ++queueIdx)
+        {
+            VkQueue queueHandle = VK_NULL_HANDLE;
+            pDevice->GetQueue(familyIdx, queueIdx, &queueHandle);
+
+            if (queueHandle != VK_NULL_HANDLE)
+            {
+                Queue*       pQueue    = ApiQueue::ObjectFromHandle(queueHandle);
+                Pal::IQueue* pPalQueue = pQueue->PalQueue(DefaultDeviceIndex);
+
+                // Get the OS context handle for this queue (RGP needs this on DX clients; it may be optional
+                // for Vulkan, but we provide it anyway when it is available).
+                Pal::KernelContextInfo kernelCxtInfo = {};
+                Pal::Result resultQueryKernel = pPalQueue->QueryKernelContextInfo(&kernelCxtInfo);
+
+                uint64_t queueId      = reinterpret_cast<uint64_t>(ApiQueue::FromObject(pQueue));
+                uint64_t queueContext = (resultQueryKernel == Pal::Result::Success)
+                                            ? kernelCxtInfo.contextIdentifier
+                                            : 0;
+
+                result = m_pQueueTimingsTraceSource->RegisterTimedQueue(pPalQueue, queueId, queueContext);
+            }
+        }
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::PostDeviceCreate(
+    Device* pDevice)
+{
+    Pal::Result result = InitUberTraceResources(pDevice->PalDevice(DefaultDeviceIndex));
+
+    if (result == Pal::Result::Success)
+    {
+        result = RegisterQueuesForDevice(pDevice);
+    }
+
+    VK_ASSERT(result == Pal::Result::Success);
+
+    auto* pDriverControlServer = m_pDevDriverServer->GetDriverControlServer();
+
+    VK_ASSERT(pDriverControlServer != nullptr);
+
+    // If the driver hasn't been marked as fully initialized yet, mark it now. We consider the driver fully
+    // initialized once the logical device has been created, mainly because PAL is fully initialized at this point
+    // and we also know whether or not the debug vmid has been acquired. External tools use this information to
+    // decide when it's reasonable to make certain requests of the driver through protocol functions.
+    if (pDriverControlServer->IsDriverInitialized() == false)
+    {
+        pDriverControlServer->FinishDeviceInit();
+    }
+}
+
+// =====================================================================================================================
+bool DevModeUberTrace::IsTracingEnabled() const
+{
+    return m_pTraceSession->IsTracingEnabled();
+}
+
+// =====================================================================================================================
+Pal::Result DevModeUberTrace::TimedQueueSubmit(
+    uint32_t               deviceIdx,
+    Queue*                 pQueue,
+    uint32_t               cmdBufferCount,
+    const VkCommandBuffer* pCommandBuffers,
+    const Pal::SubmitInfo& submitInfo,
+    VirtualStackFrame*     pVirtStackFrame)
+{
+    VK_ASSERT(cmdBufferCount == submitInfo.pPerSubQueueInfo[0].cmdBufferCount);
+
+    bool timingSupported = IsQueueTimingActive(pQueue->VkDevice()) && (submitInfo.pPerSubQueueInfo[0].cmdBufferCount > 0);
+
+    // Fill in extra meta-data information to associate the API command buffer data with the generated
+    // timing information.
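+    // Two parallel arrays of ids are recorded per command buffer below: the value of the API handle and, when
+    // the command buffer carries SQTT layer state, its SQTT command buffer id. Tools use these to correlate the
+    // timing results with both the application's command buffers and their SQTT markers.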
+    GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
+    Pal::uint64* pApiCmdBufIds  = nullptr;
+    Pal::uint32* pSqttCmdBufIds = nullptr;
+
+    if (timingSupported)
+    {
+        pApiCmdBufIds  = pVirtStackFrame->AllocArray<Pal::uint64>(cmdBufferCount);
+        pSqttCmdBufIds = pVirtStackFrame->AllocArray<Pal::uint32>(cmdBufferCount);
+
+        timedSubmitInfo.pApiCmdBufIds  = pApiCmdBufIds;
+        timedSubmitInfo.pSqttCmdBufIds = pSqttCmdBufIds;
+        timedSubmitInfo.frameIndex     = m_globalFrameIndex;
+
+        timingSupported &= (pApiCmdBufIds != nullptr) && (pSqttCmdBufIds != nullptr);
+    }
+
+    Pal::Result result = Pal::Result::NotReady;
+
+    Pal::IQueue* pPalQueue = pQueue->PalQueue(deviceIdx);
+
+    if (timingSupported)
+    {
+        for (uint32_t cbIdx = 0; cbIdx < cmdBufferCount; ++cbIdx)
+        {
+            uintptr_t intHandle = reinterpret_cast<uintptr_t>(pCommandBuffers[cbIdx]);
+
+            pApiCmdBufIds[cbIdx] = intHandle;
+
+            CmdBuffer* pCmdBuf = ApiCmdBuffer::ObjectFromHandle(pCommandBuffers[cbIdx]);
+
+            pSqttCmdBufIds[cbIdx] = 0;
+
+            if (pCmdBuf->GetSqttState() != nullptr)
+            {
+                pSqttCmdBufIds[cbIdx] = pCmdBuf->GetSqttState()->GetId().u32All;
+            }
+
+            VK_ASSERT(pCmdBuf->PalCmdBuffer(DefaultDeviceIndex) == submitInfo.pPerSubQueueInfo[0].ppCmdBuffers[cbIdx]);
+        }
+
+        // Do a timed submit of all the command buffers
+        result = m_pQueueTimingsTraceSource->TimedSubmit(pPalQueue, submitInfo, timedSubmitInfo);
+
+        VK_ASSERT(result == Pal::Result::Success);
+    }
+
+    // Punt to non-timed submit if a timed submit fails (or is not supported)
+    if (result != Pal::Result::Success)
+    {
+        result = Queue::PalQueueSubmit(pQueue->VkDevice(), pPalQueue, submitInfo);
+    }
+
+    if (pApiCmdBufIds != nullptr)
+    {
+        pVirtStackFrame->FreeArray(pApiCmdBufIds);
+    }
+
+    if (pSqttCmdBufIds != nullptr)
+    {
+        pVirtStackFrame->FreeArray(pSqttCmdBufIds);
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+Pal::Result DevModeUberTrace::TimedSignalQueueSemaphore(
+    uint32_t              deviceIdx,
+    Queue*                pQueue,
+    VkSemaphore           semaphore,
+    uint64_t              value,
+    Pal::IQueueSemaphore* pQueueSemaphore)
+{
+    Pal::IQueue* pPalQueue = pQueue->PalQueue(deviceIdx);
+
+    Pal::Result result = Pal::Result::NotReady;
+
+    if (IsQueueTimingActive(pQueue->VkDevice()))
+    {
+        GpuUtil::TimedQueueSemaphoreInfo timedSemaphoreInfo = {};
+
+        timedSemaphoreInfo.semaphoreID = (uint64_t)semaphore;
+        result = m_pQueueTimingsTraceSource->TimedSignalQueueSemaphore(pPalQueue, pQueueSemaphore, timedSemaphoreInfo, value);
+
+        VK_ASSERT(result == Pal::Result::Success);
+    }
+
+    if (result != Pal::Result::Success)
+    {
+        result = pPalQueue->SignalQueueSemaphore(pQueueSemaphore, value);
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+Pal::Result DevModeUberTrace::TimedWaitQueueSemaphore(
+    uint32_t              deviceIdx,
+    Queue*                pQueue,
+    VkSemaphore           semaphore,
+    uint64_t              value,
+    Pal::IQueueSemaphore* pQueueSemaphore)
+{
+    Pal::IQueue* pPalQueue = pQueue->PalQueue(deviceIdx);
+
+    Pal::Result result = Pal::Result::NotReady;
+
+    if (IsQueueTimingActive(pQueue->VkDevice()))
+    {
+        GpuUtil::TimedQueueSemaphoreInfo timedSemaphoreInfo = {};
+
+        timedSemaphoreInfo.semaphoreID = (uint64_t)semaphore;
+        result = m_pQueueTimingsTraceSource->TimedWaitQueueSemaphore(pPalQueue, pQueueSemaphore, timedSemaphoreInfo, value);
+
+        VK_ASSERT(result == Pal::Result::Success);
+    }
+
+    if (result != Pal::Result::Success)
+    {
+        result = pPalQueue->WaitQueueSemaphore(pQueueSemaphore, value);
+    }
+
+    return result;
+}
+
+// 
===================================================================================================================== +bool DevModeUberTrace::IsQueueTimingActive( + const Device* /*pDevice*/ + ) const +{ + return (m_pQueueTimingsTraceSource != nullptr) ? m_pQueueTimingsTraceSource->IsTimingInProgress() : false; +} + +// ===================================================================================================================== +Pal::Result DevModeUberTrace::InitUberTraceResources( + Pal::IDevice* pPalDevice) +{ + Pal::Result result = Pal::Result::ErrorOutOfMemory; + + void* pStorage = m_pInstance->AllocMem(sizeof(GpuUtil::CodeObjectTraceSource), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pStorage != nullptr) + { + m_pCodeObjectTraceSource = VK_PLACEMENT_NEW(pStorage) + GpuUtil::CodeObjectTraceSource(m_pInstance->PalPlatform()); + + result = m_pTraceSession->RegisterSource(m_pCodeObjectTraceSource); + } + + if (result == Pal::Result::Success) + { + pStorage = m_pInstance->AllocMem(sizeof(GpuUtil::QueueTimingsTraceSource), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pStorage != nullptr) + { + m_pQueueTimingsTraceSource = VK_PLACEMENT_NEW(pStorage) + GpuUtil::QueueTimingsTraceSource(m_pInstance->PalPlatform()); + + result = m_pTraceSession->RegisterSource(m_pQueueTimingsTraceSource); + } + else + { + result = Pal::Result::ErrorOutOfMemory; + } + } + + if (result == Pal::Result::Success) + { + result = m_pQueueTimingsTraceSource->Init(pPalDevice); + } + + if (result != Pal::Result::Success) + { + DestroyUberTraceResources(); + } + return result; +} + +// ===================================================================================================================== +void DevModeUberTrace::DestroyUberTraceResources() +{ + if (m_pCodeObjectTraceSource != nullptr) + { + m_pTraceSession->UnregisterSource(m_pCodeObjectTraceSource); + m_pInstance->FreeMem(m_pCodeObjectTraceSource); + m_pCodeObjectTraceSource = nullptr; + } + + if (m_pQueueTimingsTraceSource != nullptr) + { + m_pTraceSession->UnregisterSource(m_pQueueTimingsTraceSource); + m_pInstance->FreeMem(m_pQueueTimingsTraceSource); + m_pQueueTimingsTraceSource = nullptr; + } +} + +} // namespace vk + +#endif diff --git a/icd/api/devmode/devmode_ubertrace.h b/icd/api/devmode/devmode_ubertrace.h new file mode 100644 index 00000000..4710ca17 --- /dev/null +++ b/icd/api/devmode/devmode_ubertrace.h @@ -0,0 +1,148 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file devmode_ubertrace.h +* @brief Contains the UberTrace implementation of the GPU Open Developer Mode (DevModeUberTrace) +*********************************************************************************************************************** +*/ + +#ifndef __DEVMODE_DEVMODE_UBERTRACE_H__ +#define __DEVMODE_DEVMODE_UBERTRACE_H__ + +#pragma once + +#include "devmode/devmode_mgr.h" +#include "palTraceSession.h" + +// GPUOpen forward declarations +namespace DevDriver +{ +class DevDriverServer; +} + +namespace GpuUtil +{ +class CodeObjectTraceSource; +class QueueTimingsTraceSource; +} + +namespace vk +{ + +// ===================================================================================================================== +// This class provides functionality to interact with the GPU Open Developer Mode message passing service and the rest +// of the driver. +class DevModeUberTrace final : public IDevMode +{ +#if ICD_GPUOPEN_DEVMODE_BUILD +public: + ~DevModeUberTrace(); + + static VkResult Create(Instance* pInstance, DevModeUberTrace** ppObject); + + virtual void Finalize( + uint32_t deviceCount, + VulkanSettingsLoader* settingsLoaders[]) override; + + virtual void Destroy() override; + + virtual void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void WaitForDriverResume() override; + virtual void PipelineCreated(Device* pDevice, Pipeline* pPipeline) override; + virtual void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline) override; +#if VKI_RAY_TRACING + virtual void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline) override; + virtual void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline) override; +#endif + virtual void PostDeviceCreate(Device* pDevice) override; + virtual void PreDeviceDestroy(Device* pDevice) override { }; + virtual void NotifyPreSubmit() override { }; + + virtual bool IsTracingEnabled() const override; + virtual bool IsCrashAnalysisEnabled() const override { return m_crashAnalysisEnabled; } + virtual bool IsQueueTimingActive(const Device* pDevice) const override; + + virtual Pal::Result TimedQueueSubmit( + uint32_t deviceIdx, + Queue* pQueue, + uint32_t cmdBufferCount, + const VkCommandBuffer* pCommandBuffers, + const Pal::SubmitInfo& submitInfo, + VirtualStackFrame* pVirtStackFrame) override; + + virtual Pal::Result TimedSignalQueueSemaphore( + uint32_t deviceIdx, + Queue* pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + virtual Pal::Result TimedWaitQueueSemaphore( + uint32_t deviceIdx, + Queue* pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + // Deprecated functionality + virtual uint64_t GetInstructionTraceTargetHash() override { return InvalidTargetPipelineHash; }; + virtual void StartInstructionTrace(CmdBuffer* pCmdBuffer) override { }; + 
virtual void StopInstructionTrace(CmdBuffer* pCmdBuffer) override { }; + + virtual bool GetTraceFrameBeginTag(uint64_t* pTag) const override { return false; }; + virtual bool GetTraceFrameEndTag(uint64_t* pTag) const override { return false; }; + + virtual Util::Result RegisterPipelineCache( + PipelineBinaryCache* pPipelineCache, + uint32_t postSizeLimit) override { return Util::Result::Success; }; + + virtual void DeregisterPipelineCache( + PipelineBinaryCache* pPipelineCache) override { }; + +private: + DevModeUberTrace(Instance* pInstance); + + Pal::Result InitUberTraceResources(Pal::IDevice* pPalDevice); + void DestroyUberTraceResources(); + + Pal::Result RegisterQueuesForDevice(Device* pDevice); + + Instance* m_pInstance; + DevDriver::DevDriverServer* m_pDevDriverServer; + bool m_finalized; + bool m_crashAnalysisEnabled; + uint32_t m_globalFrameIndex; + + GpuUtil::TraceSession* m_pTraceSession; + GpuUtil::CodeObjectTraceSource* m_pCodeObjectTraceSource; + GpuUtil::QueueTimingsTraceSource* m_pQueueTimingsTraceSource; +#endif +}; + +} + +#endif /* __DEVMODE_DEVMODE_UBERTRACE_H__ */ diff --git a/icd/api/entry.cpp b/icd/api/entry.cpp index c45d695b..23728f08 100644 --- a/icd/api/entry.cpp +++ b/icd/api/entry.cpp @@ -374,6 +374,41 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect( ApiCmdBuffer::ObjectFromHandle(cmdBuffer)->DispatchIndirect(buffer, offset); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdPreprocessGeneratedCommandsNV( + VkCommandBuffer commandBuffer, + const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo) +{ +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdExecuteGeneratedCommandsNV( + VkCommandBuffer commandBuffer, + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo) +{ + ApiCmdBuffer::ObjectFromHandle(commandBuffer)->ExecuteIndirect(isPreprocessed, pGeneratedCommandsInfo); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBindPipelineShaderGroupNV( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline, + uint32_t groupIndex) +{ + VK_NOT_IMPLEMENTED; +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdUpdatePipelineIndirectBufferNV( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline) +{ + VK_NOT_IMPLEMENTED; +} + // ===================================================================================================================== VKAPI_ATTR void VKAPI_CALL vkCmdCopyBuffer( VkCommandBuffer cmdBuffer, diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index e5f5e0a2..7a036e53 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -199,11 +199,15 @@ static void BuildPalColorBlendStateCreateInfo( { uint32_t location = i; - if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i] != VK_ATTACHMENT_UNUSED)) + if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && + 
(extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr)) { location = extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i]; + + if (location == VK_ATTACHMENT_UNUSED) + { + continue; + } } const VkPipelineColorBlendAttachmentState& attachmentState = pColorBlendState->pAttachments[i]; @@ -820,7 +824,7 @@ VkResult GraphicsPipelineCommon::Create( } else if (pDevice->GetRuntimeSettings().pipelineLinkOptimizationMode == PipelineLinkOptimizationAlwaysOptimized) { - flags |= ~VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT; + flags |= VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT; } if ((flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) != 0) @@ -1642,11 +1646,15 @@ static void BuildColorBlendState( { uint32_t location = i; - if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i] != VK_ATTACHMENT_UNUSED)) + if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && + (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr)) { location = extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i]; + + if (location == VK_ATTACHMENT_UNUSED) + { + continue; + } } auto pCbDst = &pInfo->pipeline.cbState.target[location]; diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index bf7757fe..0c2f8811 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -112,8 +112,6 @@ enum class AppProfile : uint32_t SniperElite5, // Sniper Elite 5 by Rebellion SeriousSamVrTheLastHope, // Serious Sam VR The Last Hope by Croteam BaldursGate3, // Baldur's Gate by Larian Studios - Enshrouded, // Enshrouded by Keen Games - HolisticEngine, // Holistic Engine by Keen Games #if VKI_RAY_TRACING ControlDX12, // VKD3D Control Ultimate Edition RayTracingWeekends, // RayTracingInVulkan demo diff --git a/icd/api/include/app_shader_optimizer.h b/icd/api/include/app_shader_optimizer.h index f2b848ca..f3a1fc7b 100644 --- a/icd/api/include/app_shader_optimizer.h +++ b/icd/api/include/app_shader_optimizer.h @@ -177,7 +177,7 @@ class ShaderOptimizer const PipelineOptimizerKey& pipelineKey, Pal::DynamicComputeShaderInfo* pDynamicComputeShaderInfo) const; - void ApplyProfileToDynamicGraphicsShaderInfo( + bool ApplyProfileToDynamicGraphicsShaderInfo( const ShaderProfileAction& action, Pal::DynamicGraphicsShaderInfo* pGraphicsShaderInfo) const; diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index ca4c3a64..2ad93eb0 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -86,6 +86,7 @@ struct LlpcShaderLibraryBlobHeader { uint32_t binaryLength; // Partial ELF binary length uint32_t fragMetaLength; // Fragment shader metadata length + bool requireFullPipeline; // Whether require full pipeline }; // ===================================================================================================================== // Pipeline Creation feedback info. @@ -175,6 +176,16 @@ static GraphicsLibraryType GetGraphicsLibraryType( return stage == ShaderStage::ShaderStageFragment ? 
GraphicsLibraryFragment : GraphicsLibraryPreRaster; } +// ===================================================================================================================== +static VkGraphicsPipelineLibraryFlagBitsEXT GetVkGraphicsLibraryFlagBit( + const ShaderStage stage) +{ + VK_ASSERT(stage < ShaderStage::ShaderStageGfxCount); + return stage == ShaderStage::ShaderStageFragment ? + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT : + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT; +} + // ===================================================================================================================== struct GraphicsPipelineBinaryCreateInfo { diff --git a/icd/api/include/defer_compile_thread.h b/icd/api/include/defer_compile_thread.h index 6c53624b..c26c742e 100644 --- a/icd/api/include/defer_compile_thread.h +++ b/icd/api/include/defer_compile_thread.h @@ -114,7 +114,7 @@ class DeferCompileThread final : public Util::Thread while (m_stop == false) { // Waits for new signal. - m_event.Wait(1.0f); + m_event.Wait(Util::fseconds{ 1.0f }); m_event.Reset(); DeferredCompileWorkload task; diff --git a/icd/api/include/graphics_pipeline_common.h b/icd/api/include/graphics_pipeline_common.h index 494fd761..0de05e0c 100644 --- a/icd/api/include/graphics_pipeline_common.h +++ b/icd/api/include/graphics_pipeline_common.h @@ -334,9 +334,9 @@ class GraphicsPipelineCommon : public Pipeline // Constructor of GraphicsPipelineCommon GraphicsPipelineCommon( #if VKI_RAY_TRACING - bool hasRayTracing, + bool hasRayTracing, #endif - Device* const pDevice) + Device* const pDevice) : Pipeline( pDevice, #if VKI_RAY_TRACING diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index e2656eeb..6d09e280 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -69,7 +69,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 279 +#define VK_HEADER_VERSION 280 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -1111,6 +1111,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_POOL_OVERALLOCATION_FEATURES_NV = 1000546000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAW_ACCESS_CHAINS_FEATURES_NV = 1000555000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT16_VECTOR_FEATURES_NV = 1000563000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_VALIDATION_FEATURES_NV = 1000568000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -19134,6 +19135,18 @@ typedef struct VkPhysicalDeviceShaderAtomicFloat16VectorFeaturesNV { +// VK_NV_ray_tracing_validation is a preprocessor guard. Do not pass it to API calls. 
+#define VK_NV_ray_tracing_validation 1 +#define VK_NV_RAY_TRACING_VALIDATION_SPEC_VERSION 1 +#define VK_NV_RAY_TRACING_VALIDATION_EXTENSION_NAME "VK_NV_ray_tracing_validation" +typedef struct VkPhysicalDeviceRayTracingValidationFeaturesNV { + VkStructureType sType; + void* pNext; + VkBool32 rayTracingValidation; +} VkPhysicalDeviceRayTracingValidationFeaturesNV; + + + // VK_KHR_acceleration_structure is a preprocessor guard. Do not pass it to API calls. #define VK_KHR_acceleration_structure 1 #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 diff --git a/icd/api/include/pipeline_binary_cache.h b/icd/api/include/pipeline_binary_cache.h index 4e18cdd8..a244f765 100644 --- a/icd/api/include/pipeline_binary_cache.h +++ b/icd/api/include/pipeline_binary_cache.h @@ -42,7 +42,7 @@ namespace Util class IPlatformKey; #if ICD_GPUOPEN_DEVMODE_BUILD -class DevModeMgr; +class IDevMode; #endif } // namespace Util @@ -64,7 +64,7 @@ class PipelineBinaryCache const vk::RuntimeSettings& settings, const char* pDefaultCacheFilePath, #if ICD_GPUOPEN_DEVMODE_BUILD - vk::DevModeMgr* pDevModeMgr, + vk::IDevMode* pDevMode, #endif uint32_t expectedEntries, size_t initDataSize, @@ -238,7 +238,7 @@ class PipelineBinaryCache Util::ICacheLayer* m_pTopLayer; // Top layer of the cache chain where queries are submitted #if ICD_GPUOPEN_DEVMODE_BUILD - vk::DevModeMgr* m_pDevModeMgr; + vk::IDevMode* m_pDevMode; Util::ICacheLayer* m_pReinjectionLayer; // Reinjection interface layer HashMapping m_hashMapping; // Maps the internalPipelineHash to the appropriate CacheId diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index 1530c700..e10e5e34 100644 --- a/icd/api/include/pipeline_compiler.h +++ b/icd/api/include/pipeline_compiler.h @@ -115,6 +115,15 @@ struct RayTracingPipelineShaderStageInfo }; #endif +// ===================================================================================================================== +/// Determines whether the given stage info is from shader module identifier. 
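+/// A stage created from a shader module identifier (VK_EXT_shader_module_identifier) carries no module data,
+/// only the client hash captured at module creation, so a null pModuleData with a non-zero hash marks this case.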
+inline bool IsShaderModuleIdentifier(const Vkgc::PipelineShaderInfo& stageInfo)
+{
+    return (stageInfo.pModuleData == nullptr) &&
+           ((stageInfo.options.clientHash.lower != 0) ||
+            (stageInfo.options.clientHash.upper != 0));
+}
+
 // =====================================================================================================================
 class PipelineCompiler
 {
@@ -458,6 +467,14 @@ class PipelineCompiler
     static void DumpPipelineMetadata(
         void*                   pPipelineDumpHandle,
         const PipelineMetadata* pBinaryMetadata);
+
+    void DumpPipeline(
+        const RuntimeSettings&         settings,
+        const Vkgc::PipelineBuildInfo& pipelineInfo,
+        uint64_t                       apiPsoHash,
+        uint32_t                       binaryCount,
+        const Vkgc::BinaryData*        pElfBinary,
+        VkResult                       result);
 
 private:
     PAL_DISALLOW_COPY_AND_ASSIGN(PipelineCompiler);
diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h
index 63631471..192d051a 100644
--- a/icd/api/include/vk_cmdbuffer.h
+++ b/icd/api/include/vk_cmdbuffer.h
@@ -458,6 +458,14 @@ class CmdBuffer
         VkBuffer     countBuffer,
         VkDeviceSize countOffset);
 
+    template<bool indexed, bool useBufferCount>
+    void DrawIndirect(
+        VkDeviceSize indirectBufferVa,
+        VkDeviceSize indirectBufferSize,
+        uint32_t     count,
+        uint32_t     stride,
+        VkDeviceSize countBufferVa);
+
     void DrawMeshTasks(
         uint32_t x,
         uint32_t y,
@@ -472,6 +480,14 @@ class CmdBuffer
         VkBuffer     countBuffer,
         VkDeviceSize countOffset);
 
+    template<bool useBufferCount>
+    void DrawMeshTasksIndirect(
+        VkDeviceSize indirectBufferVa,
+        VkDeviceSize indirectBufferSize,
+        uint32_t     count,
+        uint32_t     stride,
+        VkDeviceSize countBufferVa);
+
     void Dispatch(
         uint32_t x,
         uint32_t y,
@@ -485,10 +501,17 @@ class CmdBuffer
         uint32_t dim_y,
         uint32_t dim_z);
 
+    void DispatchIndirect(
+        VkDeviceSize indirectBufferVa);
+
     void DispatchIndirect(
         VkBuffer     buffer,
         VkDeviceSize offset);
 
+    void ExecuteIndirect(
+        VkBool32                         isPreprocessed,
+        const VkGeneratedCommandsInfoNV* pInfo);
+
     template
     void CopyBuffer(
         VkBuffer srcBuffer,
@@ -1428,6 +1451,11 @@ class CmdBuffer
     }
 
 #if VKI_RAY_TRACING
+    uint64 GetCpsMemSize() const { return m_maxCpsMemSize; }
+
+    void ApplyPatchCpsRequests(
+        uint32_t               deviceIdx,
+        const Pal::IGpuMemory& cpsMem) const;
     bool HasRayTracing() const { return m_flags.hasRayTracing; }
 #endif
@@ -1460,11 +1488,10 @@ class CmdBuffer
     {
         return &m_debugPrintf;
     }
+
 private:
     PAL_DISALLOW_COPY_AND_ASSIGN(CmdBuffer);
 
-    uint32 GetHevcDbpIndex(const uint8_t* pRefPicList, uint32 dpbSlot);
-
     void ValidateGraphicsStates();
 
     void ValidateSamplePattern(uint32_t sampleCount, SamplePattern* pSamplePattern);
@@ -1823,6 +1850,7 @@ class CmdBuffer
         const RuntimeSettings&    settings,
         CmdPool*                  pCmdPool,
         const RayTracingPipeline* pPipeline,
+        uint32*                   pConstMem,
         Pal::gpusize              constGpuAddr,
         uint32_t                  width,
         uint32_t                  height,
@@ -1851,7 +1879,15 @@ class CmdBuffer
         uint32_t     height,
         uint32_t     depth,
         Buffer*      pIndirectBuffer,
-        VkDeviceSize indirectOffset);
+        VkDeviceSize indirectOffset,
+        const Pal::gpusize indirectBufferVa);
+
+    void AddPatchCpsRequest(
+        uint32_t                      deviceIdx,
+        GpuRt::DispatchRaysConstants* pConstsMem,
+        uint64_t                      bufSize);
+
+    void FreePatchCpsList();
 #endif
 
     void InsertDebugMarker(
@@ -1943,6 +1979,11 @@ class CmdBuffer
     bool m_reverseThreadGroupState;
 #if VKI_RAY_TRACING
     Util::Vector m_scratchVidMemList; // Ray-tracing scratch memory
+
+    uint64 m_maxCpsMemSize; // max ray sorting memory requested
+
+    typedef Util::Vector PatchCpsVector;
+    PatchCpsVector m_patchCpsList[MaxPalDevices];
 #endif
 };
 
@@ -2234,6 +2275,26 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect(
     VkBuffer     buffer,
     VkDeviceSize offset);
 
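+// Entry points for VK_NV_device_generated_commands. Of these, only vkCmdExecuteGeneratedCommandsNV is routed to
+// CmdBuffer::ExecuteIndirect(); vkCmdPreprocessGeneratedCommandsNV is a no-op, and the remaining two are
+// unimplemented stubs in entry.cpp.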
+VKAPI_ATTR void VKAPI_CALL vkCmdPreprocessGeneratedCommandsNV(
+    VkCommandBuffer                  commandBuffer,
+    const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdExecuteGeneratedCommandsNV(
+    VkCommandBuffer                  commandBuffer,
+    VkBool32                         isPreprocessed,
+    const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdBindPipelineShaderGroupNV(
+    VkCommandBuffer     commandBuffer,
+    VkPipelineBindPoint pipelineBindPoint,
+    VkPipeline          pipeline,
+    uint32_t            groupIndex);
+
+VKAPI_ATTR void VKAPI_CALL vkCmdUpdatePipelineIndirectBufferNV(
+    VkCommandBuffer     commandBuffer,
+    VkPipelineBindPoint pipelineBindPoint,
+    VkPipeline          pipeline);
+
 VKAPI_ATTR void VKAPI_CALL vkCmdDispatchBase(
     VkCommandBuffer commandBuffer,
     uint32_t        baseGroupX,
diff --git a/icd/api/include/vk_compute_pipeline.h b/icd/api/include/vk_compute_pipeline.h
index 8ea4e790..2e6e80b3 100644
--- a/icd/api/include/vk_compute_pipeline.h
+++ b/icd/api/include/vk_compute_pipeline.h
@@ -72,7 +72,7 @@ class ComputePipeline final : public Pipeline, public NonDispatchable(palShaderStageMask);
+}
+
 // =====================================================================================================================
 struct UberFetchShaderFormatInfo
 {
@@ -4021,6 +4073,26 @@ VkFormat GetLowPrecisionDepthFormat(
     const VkImageUsageFlags& imageUsage,
     const RuntimeSettings&   settings);
 
+const char* VkResultName(VkResult result);
+
+inline std::chrono::nanoseconds Uint64ToChronoNano(uint64_t nanoSeconds)
+{
+    const uint64_t maxNano = static_cast<uint64_t>(std::chrono::nanoseconds::max().count());
+    return std::chrono::nanoseconds { Util::Min(nanoSeconds, maxNano) };
+}
+
+inline std::chrono::milliseconds Uint64ToChronoMilli(uint64_t milliSeconds)
+{
+    const uint64_t maxMilli = static_cast<uint64_t>(std::chrono::milliseconds::max().count());
+    return std::chrono::milliseconds { Util::Min(milliSeconds, maxMilli) };
+}
+
+inline std::chrono::seconds Uint64ToChronoSeconds(uint64_t seconds)
+{
+    const uint64_t maxSeconds = static_cast<uint64_t>(std::chrono::seconds::max().count());
+    return std::chrono::seconds { Util::Min(seconds, maxSeconds) };
+}
+
 } // namespace vk
 
 #endif /* __VK_CONV_H__ */
diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h
index 654dd80e..7024ca43 100644
--- a/icd/api/include/vk_device.h
+++ b/icd/api/include/vk_device.h
@@ -166,8 +166,9 @@ class Device
         // True if EXT_PRIMITIVES_GENERATED_QUERY is enabled.
uint32 primitivesGeneratedQuery : 1; uint32 reserved1 : 1; + uint32 reserved2 : 1; - uint32 reserved : 13; + uint32 reserved : 12; }; uint32 u32All; @@ -382,6 +383,11 @@ class Device const VkAllocationCallbacks* pAllocator, VkSwapchainKHR* pSwapChain); + VkResult CreateIndirectCommandsLayout( + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout); + VkResult ImportSemaphore( VkSemaphore semaphore, const ImportSemaphoreInfo& importInfo); @@ -1422,6 +1428,22 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout2KHR( const VkImageSubresource2KHR* pSubresource, VkSubresourceLayout2KHR* pLayout); +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIndirectCommandsLayoutNV( + VkDevice device, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout); + +VKAPI_ATTR void VKAPI_CALL vkDestroyIndirectCommandsLayoutNV( + VkDevice device, + VkIndirectCommandsLayoutNV indirectCommandsLayout, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkGetGeneratedCommandsMemoryRequirementsNV( + VkDevice device, + const VkGeneratedCommandsMemoryRequirementsInfoNV* pInfo, + VkMemoryRequirements2* pMemoryRequirements); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index 7cb5f137..c116faed 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -408,6 +408,7 @@ class DeviceExtensions final : public Extensions EXT_MEMORY_PRIORITY, EXT_MESH_SHADER, EXT_MUTABLE_DESCRIPTOR_TYPE, + EXT_NESTED_COMMAND_BUFFER, EXT_NON_SEAMLESS_CUBE_MAP, EXT_PAGEABLE_DEVICE_LOCAL_MEMORY, EXT_PCI_BUS_INFO, @@ -474,6 +475,9 @@ class DeviceExtensions final : public Extensions GOOGLE_USER_TYPE, NV_COMPUTE_SHADER_DERIVATIVES, + NV_DEVICE_GENERATED_COMMANDS, + NV_DEVICE_GENERATED_COMMANDS_COMPUTE, + VALVE_MUTABLE_DESCRIPTOR_TYPE, Count }; diff --git a/icd/api/include/vk_formats.h b/icd/api/include/vk_formats.h index f4b76ca6..7d3a2497 100755 --- a/icd/api/include/vk_formats.h +++ b/icd/api/include/vk_formats.h @@ -51,6 +51,8 @@ struct AstcMappedInfo }; #endif +class PhysicalDevice; + // ===================================================================================================================== // Container for storing compile-time meta-information about Vulkan formats. 
// @@ -82,6 +84,16 @@ struct Formats #endif static VkExtent3D ElementsToTexels(VkFormat format, const VkExtent3D& extent, const RuntimeSettings& settings); static Pal::Formats::NumericSupportFlags GetNumberFormat(VkFormat format, const RuntimeSettings& settings); + + static VkFormat GetCompatibleSinglePlaneFormat( + VkFormat multiPlaneFormat, + uint32_t planeIndex); + + static VkFormatFeatureFlags GetExtendedFeatureFlags( + const PhysicalDevice* pPhysicalDevice, + VkFormat format, + VkImageTiling tiling, + const RuntimeSettings& settings); }; #define VK_EXT_4444_FORMAT_START VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT @@ -511,7 +523,6 @@ bool Formats::IsDvec3Or4( return needsTwoLocations; } - } // namespace vk #endif /* __VK_FORMATS_H__ */ diff --git a/icd/api/include/vk_indirect_commands_layout.h b/icd/api/include/vk_indirect_commands_layout.h new file mode 100644 index 00000000..211c0d10 --- /dev/null +++ b/icd/api/include/vk_indirect_commands_layout.h @@ -0,0 +1,147 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file vk_indirect_commands_layout.h + * @brief Functionality related to Vulkan indirect commands layout objects. 
+ *********************************************************************************************************************** + */ + +#ifndef __VK_INDIRECT_COMMANDS_LAYOUT_H__ +#define __VK_INDIRECT_COMMANDS_LAYOUT_H__ + +#pragma once + +#include "include/khronos/vulkan.h" +#include "include/vk_device.h" +#include "include/vk_dispatch.h" +#include "include/vk_pipeline_layout.h" + +#include "palIndirectCmdGenerator.h" + +namespace Pal +{ + +class IIndirectCmdGenerator; +struct IndirectCmdGeneratorCreateInfo; +struct IndirectParam; + +}; + +namespace vk +{ + +enum IndirectCommandsActionType +{ + Draw = 0, + DrawIndexed, + Dispatch, + MeshTask +}; + +struct IndirectCommandsInfo +{ + IndirectCommandsActionType actionType; +}; + + // ===================================================================================================================== + // API implementation of Vulkan indirect commands layout + // + // Indirect commands layout objects describe the information of indirect commands, as well as how to interpret and + // process indirect buffers. +class IndirectCommandsLayout final : public NonDispatchable<VkIndirectCommandsLayoutNV, IndirectCommandsLayout> +{ +public: + static VkResult Create( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pLayout); + + void CalculateMemoryRequirements( + const Device* pDevice, + VkMemoryRequirements2* pMemoryRequirements) const; + + void BindPreprocessBuffer( + VkBuffer buffer, + VkDeviceSize memOffset, + uint32_t deviceIdx); + + VkResult Destroy( + Device* pDevice, + const VkAllocationCallbacks* pAllocator); + + const Pal::IIndirectCmdGenerator* PalIndirectCmdGenerator(uint32_t idx) const + { + return m_perGpu[idx].pGenerator; + } + + IndirectCommandsInfo GetIndirectCommandsInfo() const + { + return m_info; + } + +private: + + PAL_DISALLOW_COPY_AND_ASSIGN(IndirectCommandsLayout); + + struct PerGpuInfo + { + Pal::IIndirectCmdGenerator* pGenerator; + Pal::gpusize preprocessBufferVirtAddr; + }; + + IndirectCommandsLayout( + const Device* pDevice, + const IndirectCommandsInfo& info, + Pal::IIndirectCmdGenerator** pGenerator, + const Pal::IndirectCmdGeneratorCreateInfo& palCreateInfo); + + static size_t ObjectSize(const Device* pDevice) + { + return sizeof(IndirectCommandsLayout) + ((pDevice->NumPalDevices() - 1) * sizeof(PerGpuInfo)); + } + + static void BuildPalCreateInfo( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + Pal::IndirectParam* pIndirectParams, + Pal::IndirectCmdGeneratorCreateInfo* pPalCreateInfo); + + IndirectCommandsInfo m_info; + Pal::IndirectCmdGeneratorCreateInfo m_palCreateInfo; + PerGpuInfo m_perGpu[1]; +}; + +// Max usage is the case where an indirect commands layout uses the full push constant size, plus binds an index +// buffer and a vertex buffer, and ends with an indexed draw. +constexpr uint32_t MaxIndirectTokenCount = MaxPushConstRegCount + 3; +constexpr uint32_t MaxIndirectTokenOffset = MaxPushConstants + + sizeof(VkBindIndexBufferIndirectCommandNV) + + sizeof(VkBindVertexBufferIndirectCommandNV) + + sizeof(VkDrawIndexedIndirectCommand); +} // namespace vk + +#endif /* __VK_INDIRECT_COMMANDS_LAYOUT_H__ */ diff --git a/icd/api/include/vk_instance.h b/icd/api/include/vk_instance.h index ce615435..f0771fa5 100644 --- a/icd/api/include/vk_instance.h +++ b/icd/api/include/vk_instance.h @@ -60,7 +60,7 @@ namespace vk { // Forward declare classes used in this file.
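The MaxIndirectTokenCount and MaxIndirectTokenOffset bounds in the vk_indirect_commands_layout.h header above budget for the worst-case sequence their comment describes: one token per push constant register, plus three more tokens for an index buffer bind, a vertex buffer bind, and the final indexed draw. As a minimal sketch of how an application reaches this object through the NV_device_generated_commands entry points exposed by this change, assuming the extension is enabled; device, pipeline, and maxSequencesCount stand in for application-side values, and error handling is omitted:

    // Describe a one-token layout: each sequence in the indirect buffer is a
    // single VkDrawIndirectCommand read from stream 0.
    VkIndirectCommandsLayoutTokenNV token = {};
    token.sType     = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV;
    token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV;
    token.stream    = 0;
    token.offset    = 0;

    uint32_t stride = sizeof(VkDrawIndirectCommand);

    VkIndirectCommandsLayoutCreateInfoNV layoutInfo = {};
    layoutInfo.sType             = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV;
    layoutInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
    layoutInfo.tokenCount        = 1;
    layoutInfo.pTokens           = &token;
    layoutInfo.streamCount       = 1;
    layoutInfo.pStreamStrides    = &stride;

    VkIndirectCommandsLayoutNV layout = VK_NULL_HANDLE;
    vkCreateIndirectCommandsLayoutNV(device, &layoutInfo, nullptr, &layout);

    // Size the preprocess buffer; the driver answers this query through
    // IndirectCommandsLayout::CalculateMemoryRequirements declared above.
    VkGeneratedCommandsMemoryRequirementsInfoNV memInfo = {};
    memInfo.sType                  = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_MEMORY_REQUIREMENTS_INFO_NV;
    memInfo.pipelineBindPoint      = VK_PIPELINE_BIND_POINT_GRAPHICS;
    memInfo.pipeline               = pipeline;
    memInfo.indirectCommandsLayout = layout;
    memInfo.maxSequencesCount      = maxSequencesCount;

    VkMemoryRequirements2 memReqs = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 };
    vkGetGeneratedCommandsMemoryRequirementsNV(device, &memInfo, &memReqs);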
-class DevModeMgr; +class IDevMode; class ApiInstance; class DisplayManager; class GpuMemoryEventHandler; @@ -231,8 +231,8 @@ class Instance Pal::NullGpuId GetNullGpuId() const { return m_nullGpuId; } - DevModeMgr* GetDevModeMgr() - { return m_pDevModeMgr; } + IDevMode* GetDevModeMgr() + { return m_pDevMode; } GpuMemoryEventHandler* GetGpuMemoryEventHandler() const { return m_pGpuMemoryEventHandler; } @@ -359,7 +359,7 @@ class Instance ScreenObject m_screens[Pal::MaxScreens]; void* m_pScreenStorage; - DevModeMgr* m_pDevModeMgr; // GPUOpen Developer Mode manager. + IDevMode* m_pDevMode; // GPUOpen Developer Mode manager. static const size_t APP_INFO_MAX_CHARS = 256; char m_applicationName[APP_INFO_MAX_CHARS]; diff --git a/icd/api/include/vk_physical_device_manager.h b/icd/api/include/vk_physical_device_manager.h index f92229fe..019c165a 100644 --- a/icd/api/include/vk_physical_device_manager.h +++ b/icd/api/include/vk_physical_device_manager.h @@ -50,6 +50,7 @@ namespace vk // Forward declare Vulkan classes used in this file. class Instance; class PhysicalDevice; +class ExperimentsLoader; class PhysicalDeviceManager { @@ -111,6 +112,7 @@ class PhysicalDeviceManager Util::Mutex m_devicesLock; // Mutex used to lock access to the vector of physical devices VkPhysicalDeviceProperties* m_pAllNullProperties; // Physical device properties exposed when NULL_GPU=ALL + ExperimentsLoader* m_pExperimentsLoader; }; } diff --git a/icd/api/include/vk_pipeline.h b/icd/api/include/vk_pipeline.h index 81e66e97..ab8a785e 100644 --- a/icd/api/include/vk_pipeline.h +++ b/icd/api/include/vk_pipeline.h @@ -245,11 +245,11 @@ class Pipeline protected: Pipeline( - Device* const pDevice, + Device* const pDevice, #if VKI_RAY_TRACING - bool hasRayTracing, + bool hasRayTracing, #endif - VkPipelineBindPoint type); + VkPipelineBindPoint type); void Init( Pal::IPipeline** pPalPipeline, @@ -347,6 +347,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR uint32_t* pInternalRepresentationCount, VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations); +VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetPipelineIndirectDeviceAddressNV( + VkDevice device, + const VkPipelineIndirectDeviceAddressInfoNV* pInfo); + +VKAPI_ATTR void VKAPI_CALL vkGetPipelineIndirectMemoryRequirementsNV( + VkDevice device, + const VkComputePipelineCreateInfo* pCreateInfo, + VkMemoryRequirements2* pMemoryRequirements); }; } // namespace vk diff --git a/icd/api/include/vk_queue.h b/icd/api/include/vk_queue.h index dbc9453c..ac3f5215 100644 --- a/icd/api/include/vk_queue.h +++ b/icd/api/include/vk_queue.h @@ -40,6 +40,7 @@ #include "include/vk_instance.h" #include "include/vk_utils.h" #include "include/virtual_stack_mgr.h" +#include "include/internal_mem_mgr.h" #include "palQueue.h" @@ -56,7 +57,7 @@ namespace vk struct CmdBufState; class CmdBufferRing; class Device; -class DevModeMgr; +class IDevMode; class ApiQueue; class Instance; class SwapChain; @@ -66,6 +67,15 @@ class SqttQueueState; class PhysicalDevice; class Memory; +#if VKI_RAY_TRACING +// Memory tracker for CPS stack memory to be freed +struct CpsMemTracker +{ + InternalMemory* pMem; + Pal::IFence* pFence; +}; +#endif + // ===================================================================================================================== // A Vulkan queue. 
class Queue @@ -255,10 +265,6 @@ class Queue const Pal::CmdBufInfo& cmdBufInfo, CmdBufState* pCmdBufState); - VkResult SynchronizeBackBuffer( - Memory* pMemory, - uint32_t deviceIdx); - protected: // This is a helper structure during a virtual remap (sparse bind) call to batch remaps into // as few calls as possible. @@ -353,9 +359,17 @@ class Queue const Pal::PresentSwapChainInfo* pPresentInfo); void DevModeFrameBoundary( - DevModeMgr* pDevModeMgr, + IDevMode* pDevMode, const VkFrameBoundaryEXT* pFrameBoundaryInfo); +#if VKI_RAY_TRACING + void FreeRetiredCpsStackMem(); + + Pal::IFence* GetCpsStackMem( + uint32_t deviceIdx, + uint64_t size); +#endif + Pal::IQueue* m_pPalQueues[MaxPalDevices]; Pal::IQueue* m_pPalBackupQueues[MaxPalDevices]; Pal::IQueue* m_pPalBackupTmzQueues[MaxPalDevices]; @@ -369,7 +383,7 @@ class Queue uint32_t m_queueFamilyIndex; // This queue's family index uint32_t m_queueIndex; // This queue's index within the node group uint32_t m_queueFlags; - DevModeMgr* m_pDevModeMgr; + IDevMode* m_pDevMode; VirtualStackAllocator* m_pStackAllocator; VidPnSourceFlipStatus m_flipStatus; Pal::PerSourceFrameMetadataControl m_palFrameMetadataControl; @@ -379,6 +393,15 @@ class Queue const bool m_isDeviceIndependent; +#if VKI_RAY_TRACING + InternalMemory* m_pCpsGlobalMem; + + typedef Util::List<CpsMemTracker, PalAllocator> CpsMemDestroyList; + typedef Util::ListIterator<CpsMemTracker, PalAllocator> CpsMemDestroyListIterator; + + CpsMemDestroyList m_cpsMemDestroyList; // list of cps stack memory to be destroyed +#endif + private: PAL_DISALLOW_COPY_AND_ASSIGN(Queue); }; diff --git a/icd/api/include/vk_swapchain.h b/icd/api/include/vk_swapchain.h index f845f7dd..f04ae65c 100644 --- a/icd/api/include/vk_swapchain.h +++ b/icd/api/include/vk_swapchain.h @@ -154,6 +154,8 @@ class SwapChain final : public NonDispatchable const Pal::ScreenColorConfig& GetColorParams() const { return m_colorParams; } + bool IsFullscreenOrEfsePresent() const; + Pal::IGpuMemory* UpdatePresentInfo( uint32_t deviceIdx, uint32_t imageIndex, @@ -187,9 +189,9 @@ class SwapChain final : public NonDispatchable void MarkAsDeprecated( const VkAllocationCallbacks* pAllocator); - bool IsDxgiEnabled() const + uint32_t GetVidPnSourceId() const { - return (m_properties.displayableInfo.palPlatform == Pal::WsiPlatform::Dxgi); + return m_vidPnSourceId; } bool IsSuboptimal(uint32_t deviceIdx); @@ -200,6 +202,7 @@ class SwapChain final : public NonDispatchable const Properties& properties, VkPresentModeKHR presentMode, FullscreenMgr* pFullscreenMgr, + uint32_t m_vidPnSourceId, Pal::WorkstationStereoMode wsStereoMode, Pal::ISwapChain* pPalSwapChain); @@ -224,7 +227,9 @@ class SwapChain final : public NonDispatchable uint32_t m_queueFamilyIndex; // Queue family index of the last present - Pal::WorkstationStereoMode m_wsStereoMode; // Workstation Stereo Mode + uint32_t m_vidPnSourceId; // Video present source identifier.
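The CpsMemTracker entries and m_cpsMemDestroyList members added to Queue in the vk_queue.h hunk above implement deferred destruction of continuation-stack (CPS) memory: when the global CPS allocation has to grow, the old allocation may still be referenced by in-flight work, so it is retired behind the fence of its last submit instead of being freed immediately. A self-contained sketch of that retirement pattern, with stand-in types in place of InternalMemory and Pal::IFence:

    #include <list>

    struct Fence  { bool signaled; };   // stands in for Pal::IFence
    struct Memory { };                  // stands in for InternalMemory

    struct CpsMemTracker
    {
        Memory* pMem;
        Fence*  pFence;
    };

    class Queue
    {
    public:
        // Called when the CPS stack must grow: the old allocation may still be
        // in use on the GPU, so queue it behind the fence of its last submit.
        void RetireCpsStackMem(Memory* pMem, Fence* pFence)
        {
            m_cpsMemDestroyList.push_back(CpsMemTracker{ pMem, pFence });
        }

        // Called on later submits: free every retired allocation whose guarding
        // fence has signaled, stopping at the first entry that is still pending.
        // Entries are queued in submission order, so fences signal front to back.
        void FreeRetiredCpsStackMem()
        {
            while ((m_cpsMemDestroyList.empty() == false) &&
                   m_cpsMemDestroyList.front().pFence->signaled)
            {
                delete m_cpsMemDestroyList.front().pMem;
                m_cpsMemDestroyList.pop_front();
            }
        }

    private:
        std::list<CpsMemTracker> m_cpsMemDestroyList;   // the driver uses Util::List
    };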
+ + Pal::WorkstationStereoMode m_wsStereoMode; // Workstation Stereo Mode Device::InternalPipeline m_pAutoStereoPipeline; // Auto Stereo shader private: @@ -263,8 +268,7 @@ class FullscreenMgr FullscreenMgr::Mode mode, Pal::IScreen* pScreen, Pal::OsDisplayHandle hDisplay, - Pal::OsWindowHandle hWindow, - uint32_t vidPnSourceId); + Pal::OsWindowHandle hWindow); ~FullscreenMgr(); @@ -283,19 +287,11 @@ class FullscreenMgr void Destroy(const VkAllocationCallbacks* pAllocator); - void UpdatePresentInfo( - SwapChain* pSwapChain, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags); - Pal::Result IsFullscreenOwnershipSafe() const; ExclusiveModeFlags GetExclusiveModeFlags() const { return m_exclusiveModeFlags; } - uint32_t GetVidPnSourceId() const - { return m_vidPnSourceId; } - Pal::IScreen* GetPalScreen() const { return m_pScreen; } @@ -323,8 +319,7 @@ class FullscreenMgr Pal::OsDisplayHandle m_hDisplay; // The monitor of the IScreen from swap chain creation Pal::OsWindowHandle m_hWindow; // The window of the swap chain - uint32_t m_vidPnSourceId; // Video present source identifier - Mode m_mode; // Indicates the Presentation mode we are using + Mode m_mode; // Indicates the Presentation mode we are using }; // ===================================================================================================================== diff --git a/icd/api/include/vk_utils.h b/icd/api/include/vk_utils.h index 8c048313..cda14997 100644 --- a/icd/api/include/vk_utils.h +++ b/icd/api/include/vk_utils.h @@ -143,14 +143,12 @@ inline uint64_t TicksToNano(uint64_t ticks) // Get driver build time hash uint32_t GetBuildTimeHash(); -#if DEBUG // ===================================================================================================================== // If turned on and exe name is a match, this function spins idle until we have a debugger hooked. 
void WaitIdleForDebugger(bool waitIdleToggled, const char* pWaitIdleExeName, uint32_t debugTimeout); -#endif // ===================================================================================================================== -// This function can be used to get the right externsion structure of specific type in case there are more than one +// This function can be used to get the right extension structure of specific type in case more than one // extension is supported inline const VkStructHeader* GetExtensionStructure(const VkStructHeader* pHeader, VkStructureType sType) { diff --git a/icd/api/pipeline_binary_cache.cpp b/icd/api/pipeline_binary_cache.cpp index 8803ae9c..b157a3b2 100644 --- a/icd/api/pipeline_binary_cache.cpp +++ b/icd/api/pipeline_binary_cache.cpp @@ -109,7 +109,7 @@ PipelineBinaryCache* PipelineBinaryCache::Create( const RuntimeSettings& settings, const char* pDefaultCacheFilePath, #if ICD_GPUOPEN_DEVMODE_BUILD - vk::DevModeMgr* pDevModeMgr, + vk::IDevMode* pDevMode, #endif uint32_t expectedEntries, size_t initDataSize, @@ -129,7 +129,7 @@ PipelineBinaryCache* PipelineBinaryCache::Create( pObj = VK_PLACEMENT_NEW(pMem) PipelineBinaryCache(pAllocationCallbacks, gfxIp, expectedEntries); #if ICD_GPUOPEN_DEVMODE_BUILD - pObj->m_pDevModeMgr = pDevModeMgr; + pObj->m_pDevMode = pDevMode; #endif if (pObj->Initialize(settings, createArchiveLayers, pDefaultCacheFilePath, pKey) != VK_SUCCESS) @@ -193,7 +193,7 @@ PipelineBinaryCache::PipelineBinaryCache( m_pPlatformKey { nullptr }, m_pTopLayer { nullptr }, #if ICD_GPUOPEN_DEVMODE_BUILD - m_pDevModeMgr { nullptr }, + m_pDevMode { nullptr }, m_pReinjectionLayer { nullptr }, m_hashMapping { 32, &m_palAllocator }, #endif @@ -530,9 +530,9 @@ void PipelineBinaryCache::FreePipelineBinary( void PipelineBinaryCache::Destroy() { #if ICD_GPUOPEN_DEVMODE_BUILD - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_pDevModeMgr->DeregisterPipelineCache(this); + m_pDevMode->DeregisterPipelineCache(this); } #endif @@ -575,7 +575,7 @@ VkResult PipelineBinaryCache::Initialize( if ((result == VK_SUCCESS) && (m_pReinjectionLayer != nullptr)) { - Util::Result palResult = m_pDevModeMgr->RegisterPipelineCache( + Util::Result palResult = m_pDevMode->RegisterPipelineCache( this, settings.devModePipelineUriServicePostSizeLimit); @@ -612,7 +612,7 @@ VkResult PipelineBinaryCache::InitReinjectionLayer( { VkResult result = VK_ERROR_FEATURE_NOT_PRESENT; - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { Util::MemoryCacheCreateInfo info = {}; Util::AllocCallbacks allocCbs = { @@ -1080,13 +1080,17 @@ VkResult PipelineBinaryCache::InitArchiveLayers( if (settings.allowCleanUpCacheDirectory) { - uint64 totalSize = 0, oldestTime = 0; + uint64 totalSize = 0; + Util::SecondsSinceEpoch oldestTime = { }; if (Util::GetStatusOfDir(pCachePath, &totalSize, &oldestTime) == Util::Result::Success) { if (totalSize >= settings.pipelineCacheDefaultLocationLimitation) { - Util::RemoveFilesOfDirOlderThan(pCachePath, - oldestTime + settings.thresholdOfCleanUpCache); + const uint64 sec = oldestTime.time_since_epoch().count() + + settings.thresholdOfCleanUpCache; + + Util::RemoveFilesOfDirOlderThan( + pCachePath, Util::SecondsSinceEpoch { Uint64ToChronoSeconds(sec) }); } } } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 23dc846f..99083a4b 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -1110,15 +1110,6 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( } } - if
(shouldCompile) - { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - } - if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; @@ -1181,6 +1172,9 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( DumpPipelineMetadata(pPipelineDumpHandle, pCreateInfo->pBinaryMetadata); } + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -1244,6 +1238,9 @@ VkResult PipelineCompiler::CreateGraphicsShaderBinary( if (pPipelineDumpHandle != nullptr) { + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -1299,12 +1296,19 @@ VkResult PipelineCompiler::CreateColorExportShaderLibrary( Vkgc::PipelineBuildInfo pipelineInfo = {}; GraphicsPipelineBuildInfo graphicsInfo = pCreateInfo->pipelineInfo; graphicsInfo.task.pModuleData = nullptr; + graphicsInfo.task.options.clientHash = {}; graphicsInfo.vs.pModuleData = nullptr; + graphicsInfo.vs.options.clientHash = {}; graphicsInfo.tcs.pModuleData = nullptr; + graphicsInfo.tcs.options.clientHash = {}; graphicsInfo.tes.pModuleData = nullptr; + graphicsInfo.tes.options.clientHash = {}; graphicsInfo.gs.pModuleData = nullptr; + graphicsInfo.gs.options.clientHash = {}; graphicsInfo.mesh.pModuleData = nullptr; + graphicsInfo.mesh.options.clientHash = {}; graphicsInfo.fs.pModuleData = nullptr; + graphicsInfo.fs.options.clientHash = {}; pipelineInfo.pGraphicsInfo = &graphicsInfo; pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, @@ -1364,6 +1368,9 @@ VkResult PipelineCompiler::CreateColorExportShaderLibrary( if (pPipelineDumpHandle != nullptr) { + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } pCreateInfo->pipelineInfo.unlinked = false; @@ -1477,14 +1484,6 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( } } } - if (shouldCompile) - { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - } if (settings.enablePipelineDump && (result == VK_SUCCESS)) { @@ -1540,6 +1539,9 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( { Vkgc::IPipelineDumper::DumpPipelineBinary(pPipelineDumpHandle, m_gfxIp, pPipelineBinary); } + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -2090,18 +2092,6 @@ static void BuildMultisampleStateInFoi( } } -// ===================================================================================================================== -static void BuildViewportState( - const Device* pDevice, - const 
VkPipelineViewportStateCreateInfo* pVs, - const uint64_t dynamicStateFlags, - GraphicsPipelineBinaryCreateInfo* pCreateInfo) -{ - if (pVs != nullptr) - { - } -} - // ===================================================================================================================== void PipelineCompiler::BuildNggState( const Device* pDevice, @@ -2228,18 +2218,10 @@ void PipelineCompiler::BuildPipelineShaderInfo( pCompiler->ApplyDefaultShaderOptions(stage, pShaderInfoIn->flags, &pShaderInfoOut->options); - if (pShaderInfoIn->pModuleHandle != nullptr) - { - Pal::ShaderHash clientHash = ShaderModule::GetCodeHash( - pShaderInfoIn->pModuleHandle->codeHash, pShaderInfoIn->pEntryPoint); - pShaderInfoOut->options.clientHash.lower = clientHash.lower; - pShaderInfoOut->options.clientHash.upper = clientHash.upper; - } - else - { - pShaderInfoOut->options.clientHash.lower = pShaderInfoIn->codeHash.lower; - pShaderInfoOut->options.clientHash.upper = pShaderInfoIn->codeHash.upper; - } + + pShaderInfoOut->options.clientHash.lower = pShaderInfoIn->codeHash.lower; + pShaderInfoOut->options.clientHash.upper = pShaderInfoIn->codeHash.upper; + ApplyProfileOptions(pDevice, static_cast(stage), pPipelineOptions, @@ -2441,11 +2423,15 @@ static void BuildColorBlendState( { uint32_t location = i; - if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i] != VK_ATTACHMENT_UNUSED)) + if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && + (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr)) { location = extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i]; + + if (location == VK_ATTACHMENT_UNUSED) + { + continue; + } } auto pLlpcCbDst = &pCreateInfo->pipelineInfo.cbState.target[location]; @@ -2624,11 +2610,6 @@ static void BuildPreRasterizationShaderState( BuildRasterizationState(pIn->pRasterizationState, dynamicStateFlags, &isConservativeOverestimation, pCreateInfo); - if (pCreateInfo->pipelineInfo.rsState.rasterizerDiscardEnable == false) - { - BuildViewportState(pDevice, pIn->pViewportState, dynamicStateFlags, pCreateInfo); - } - PipelineCompiler::BuildNggState( pDevice, activeStages, isConservativeOverestimation, unrestrictedPrimitiveTopology, pCreateInfo); @@ -3681,7 +3662,8 @@ void PipelineCompiler::FreeGraphicsPipelineCreateInfo( pCreateInfo->pTempBuffer = nullptr; } - if (pCreateInfo->pBinaryMetadata->internalBufferInfo.pData != nullptr) + if ((pCreateInfo->pBinaryMetadata != nullptr) && + (pCreateInfo->pBinaryMetadata->internalBufferInfo.pData != nullptr)) { pInstance->FreeMem(pCreateInfo->pBinaryMetadata->internalBufferInfo.pData); pCreateInfo->pBinaryMetadata->internalBufferInfo.pData = nullptr; @@ -4074,15 +4056,6 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( bool shaderModuleReplaced = false; - if (shouldCompile) - { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - } - if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; @@ -4236,6 +4209,9 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( } } + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + 
Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -5599,6 +5575,48 @@ void PipelineCompiler::DumpPipelineMetadata( } } +// ===================================================================================================================== +void PipelineCompiler::DumpPipeline( + const RuntimeSettings& settings, + const Vkgc::PipelineBuildInfo& pipelineInfo, + uint64_t apiPsoHash, + uint32_t binaryCount, + const Vkgc::BinaryData* pElfBinaries, + VkResult result) +{ + Vkgc::PipelineDumpOptions dumpOptions = {}; + dumpOptions.pDumpDir = settings.pipelineDumpDir; + dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; + dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; + dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + + void* pPipelineDumpHandle = nullptr; + if (settings.dumpPipelineWithApiHash) + { + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump( + &dumpOptions, pipelineInfo, apiPsoHash); + } + else + { + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump( + &dumpOptions, pipelineInfo); + } + + for (uint32_t i = 0; i < binaryCount; i++) + { + if (pElfBinaries[i].codeSize > 0 && pElfBinaries[i].pCode != nullptr) + { + Vkgc::IPipelineDumper::DumpPipelineBinary( + pPipelineDumpHandle, m_gfxIp, &pElfBinaries[i]); + } + } + + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); + Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); +} + // ===================================================================================================================== // Template instantiation needed for references in other files. Linux complains if we don't do this. 
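Each dump site in this change appends the same footer right before EndPipelineDump, so a pipeline dump now records whether its compile succeeded next to the dumped ELF binaries. A small helper with the same shape (hypothetical, not part of this patch) shows the pattern in isolation:

    // Append ";CompileResult=VK_SUCCESS" (or the failing VkResult name) to an
    // open pipeline dump before it is closed. Util::Snprintf, VkResultName and
    // Vkgc::IPipelineDumper are the same utilities used at the dump sites above.
    static void DumpCompileResult(
        void*    pPipelineDumpHandle,
        VkResult result)
    {
        if (pPipelineDumpHandle != nullptr)
        {
            char resultMsg[64];
            Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result));
            Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg);
        }
    }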
diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index e9a47178..17b93902 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -182,6 +182,7 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->fp16BoxModeMixedSaThresh = Util::Clamp(fp16BoxMixedThreshold, 1.0f, 8.0f); pDeviceSettings->enableMortonCode30 = settings.rtEnableMortonCode30; pDeviceSettings->enableVariableBitsMortonCodes = settings.enableVariableBitsMortonCodes; + pDeviceSettings->enableFastLBVH = settings.rtEnableFastLbvh; pDeviceSettings->enablePrefixScanDLB = settings.rtEnablePrefixScanDlb; switch (settings.rtTriangleCompressionMode) @@ -214,8 +215,12 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->bvhCpuBuildModeFastTrace = static_cast(settings.rtBvhCpuBuildMode); pDeviceSettings->bvhCpuBuildModeDefault = static_cast(settings.rtBvhCpuBuildMode); pDeviceSettings->bvhCpuBuildModeFastBuild = static_cast(settings.rtBvhCpuBuildMode); + pDeviceSettings->enableTriangleSplitting = settings.rtEnableTriangleSplitting; pDeviceSettings->triangleSplittingFactor = settings.rtTriangleSplittingFactor; + pDeviceSettings->tsBudgetPerTriangle = settings.rtTriangleSplittingBudgetPerTriangle; + pDeviceSettings->tsPriority = settings.rtTriangleSplittingPriority; + pDeviceSettings->enableFusedInstanceNode = settings.enableFusedInstanceNode; pDeviceSettings->rebraidFactor = settings.rebraidFactor; pDeviceSettings->rebraidLengthPercentage = settings.rebraidLengthPercentage; @@ -232,9 +237,6 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->numMortonSizeBits = settings.numMortonSizeBits; pDeviceSettings->allowFp16BoxNodesInUpdatableBvh = settings.rtAllowFp16BoxNodesInUpdatableBvh; - pDeviceSettings->enableBuildAccelStructScratchDumping = pDeviceSettings->enableBuildAccelStructDumping && - settings.rtEnableAccelerationStructureScratchMemoryDump; - // Enable AS stats based on panel setting pDeviceSettings->enableBuildAccelStructStats = settings.rtEnableBuildAccelStructStats; // Number of Rebraid Iterations and rebraid Quality Heuristics @@ -305,9 +307,10 @@ bool RayTracingDevice::AccelStructTrackerEnabled( uint32_t deviceIdx ) const { - return (GetAccelStructTracker(deviceIdx) != nullptr) && - (m_pDevice->GetRuntimeSettings().enableTraceRayAccelStructTracking || - m_pGpuRtDevice[deviceIdx]->AccelStructTraceEnabled()); + + // Enable tracking when forced on in the panel or the GPURT trace source is enabled. 
+ return ((GetAccelStructTracker(deviceIdx) != nullptr) && ( + m_pGpuRtDevice[deviceIdx]->AccelStructTraceEnabled())); } // ===================================================================================================================== @@ -1012,7 +1015,7 @@ Pal::Result RayTracingDevice::ClientFlushCmdContext( if (result == Pal::Result::Success) { - result = pCmdContext->pDevice->WaitForFences(1, &pCmdContext->pFence, true, UINT64_MAX); + result = pCmdContext->pDevice->WaitForFences(1, &pCmdContext->pFence, true, std::chrono::nanoseconds::max()); } return result; @@ -1128,15 +1131,13 @@ Pal::Result RayTracingDevice::ClientGetTemporaryGpuMemory( VK_ASSERT(pCmdbuf != nullptr); vk::Device* pDevice = pCmdbuf->VkDevice(); - for (uint32_t deviceIdx = 0; - pDevice->NumPalDevices(); - ++deviceIdx) + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); ++deviceIdx) { if (pCmdbuf->PalCmdBuffer(deviceIdx) != pPalCmdbuf) continue; InternalMemory* pVidMem = nullptr; - if (pCmdbuf->GetScratchVidMem(sizeInBytes, InternalPoolGpuReadOnlyCpuVisible, &pVidMem) == VK_SUCCESS) + if (pCmdbuf->GetScratchVidMem(sizeInBytes, InternalPoolDescriptorTable, &pVidMem) == VK_SUCCESS) { if (pVidMem != nullptr) { diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp index a7425fad..d5ab2665 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp @@ -613,7 +613,7 @@ VkResult RayTracingPipeline::CreateImpl( const uint32_t totalGroupCount = pipelineCreateInfo.groupCount + pipelineLibGroupCount; - RayTracingPipelineBinary pipelineBinary[MaxPalDevices] = {}; + RayTracingPipelineBinary pipelineBinaries[MaxPalDevices] = {}; Vkgc::RayTracingShaderIdentifier* pShaderGroups [MaxPalDevices] = {}; BinaryData librarySummaries[MaxPalDevices] = {}; @@ -636,8 +636,8 @@ VkResult RayTracingPipeline::CreateImpl( { if (pipelineCreateInfo.groupCount > 0) { - pipelineBinary[0].shaderGroupHandle.shaderHandles = pShaderGroups[0]; - pipelineBinary[0].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; + pipelineBinaries[0].shaderGroupHandle.shaderHandles = pShaderGroups[0]; + pipelineBinaries[0].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; } for (uint32_t deviceIdx = 1; deviceIdx < m_pDevice->NumPalDevices(); ++deviceIdx) @@ -645,8 +645,8 @@ VkResult RayTracingPipeline::CreateImpl( pShaderGroups[deviceIdx] = pShaderGroups[deviceIdx - 1] + totalGroupCount; if (pipelineCreateInfo.groupCount > 0) { - pipelineBinary[deviceIdx].shaderGroupHandle.shaderHandles = pShaderGroups[deviceIdx]; - pipelineBinary[deviceIdx].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; + pipelineBinaries[deviceIdx].shaderGroupHandle.shaderHandles = pShaderGroups[deviceIdx]; + pipelineBinaries[deviceIdx].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; } } } @@ -668,11 +668,11 @@ VkResult RayTracingPipeline::CreateImpl( nullptr, utils::PlacementElement{ - &pipelineBinary[0].shaderPropSet.shaderProps, + &pipelineBinaries[0].shaderPropSet.shaderProps, maxFunctionCount * m_pDevice->NumPalDevices()}, utils::PlacementElement{ - &pipelineBinary[0].pPipelineBins, + &pipelineBinaries[0].pPipelineBins, maxPipelineBinaryCount * m_pDevice->NumPalDevices()}, utils::PlacementElement{&pIndirectFuncInfo, maxFunctionCount}, @@ -691,13 +691,13 @@ VkResult RayTracingPipeline::CreateImpl( memset(pTempBuffer, 0, placement.SizeOf()); placement.FixupPtrs(pTempBuffer); - 
pipelineBinary[0].shaderPropSet.shaderCount = maxFunctionCount; - pipelineBinary[0].pipelineBinCount = maxPipelineBinaryCount; + pipelineBinaries[0].shaderPropSet.shaderCount = maxFunctionCount; + pipelineBinaries[0].pipelineBinCount = maxPipelineBinaryCount; for (uint32_t deviceIdx = 1; deviceIdx < m_pDevice->NumPalDevices(); ++deviceIdx) { - const auto pBinary = &pipelineBinary[deviceIdx]; - const auto& prevBinary = pipelineBinary[deviceIdx - 1]; + const auto pBinary = &pipelineBinaries[deviceIdx]; + const auto& prevBinary = pipelineBinaries[deviceIdx - 1]; pBinary->pipelineBinCount = maxPipelineBinaryCount; pBinary->pPipelineBins = prevBinary.pPipelineBins + maxPipelineBinaryCount; @@ -733,7 +733,7 @@ VkResult RayTracingPipeline::CreateImpl( &cacheId[deviceIdx] ); - bool forceCompilation = m_pDevice->GetRuntimeSettings().enablePipelineDump; + bool forceCompilation = false; if (forceCompilation == false) { Vkgc::BinaryData cachedBinData = {}; @@ -761,31 +761,75 @@ VkResult RayTracingPipeline::CreateImpl( // Unpack the cached blob into separate binaries. pDefaultCompiler->ExtractRayTracingPipelineBinary( &cachedBinData, - &pipelineBinary[deviceIdx]); + &pipelineBinaries[deviceIdx]); } } - // Compile if unable to retrieve from cache. if (cacheResult != Util::Result::Success) { - result = pDefaultCompiler->ConvertRayTracingPipelineInfo( + if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && + (flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)) + { + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } + + bool shouldConvert = (pCreateInfo != nullptr) && + (settings.enablePipelineDump || (cacheResult != Util::Result::Success)); + + VkResult convertResult = VK_ERROR_UNKNOWN; + if (shouldConvert) + { + convertResult = pDefaultCompiler->ConvertRayTracingPipelineInfo( m_pDevice, &pipelineCreateInfo, flags, &shaderInfo, &optimizerKey, &binaryCreateInfo); + result = (result == VK_SUCCESS) ? convertResult : result; + } - if (result == VK_SUCCESS) + if ((result == VK_SUCCESS) && + (convertResult == VK_SUCCESS) && + (cacheResult != Util::Result::Success)) + { + for (uint32_t i = 0; i < binaryCreateInfo.pipelineInfo.shaderCount; i++) { - result = pDefaultCompiler->CreateRayTracingPipelineBinary( - m_pDevice, - deviceIdx, - pPipelineCache, - &binaryCreateInfo, - &pipelineBinary[deviceIdx], - &cacheId[deviceIdx]); + if (IsShaderModuleIdentifier(binaryCreateInfo.pipelineInfo.pShaders[i])) + { + result = VK_ERROR_UNKNOWN; + break; + } + } + } + + if (settings.enablePipelineDump && (convertResult == VK_SUCCESS)) + { + if ((cacheResult == Util::Result::Success) || (result != VK_SUCCESS)) + { + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pRayTracingInfo = &binaryCreateInfo.pipelineInfo; + pDefaultCompiler->DumpPipeline( + m_pDevice->GetRuntimeSettings(), + pipelineInfo, + binaryCreateInfo.apiPsoHash, + pipelineBinaries[deviceIdx].pipelineBinCount, + pipelineBinaries[deviceIdx].pPipelineBins, + result); } + } + + // Compile if unable to retrieve from cache. + if ((result == VK_SUCCESS) && (cacheResult != Util::Result::Success)) + { + result = pDefaultCompiler->CreateRayTracingPipelineBinary( + m_pDevice, + deviceIdx, + pPipelineCache, + &binaryCreateInfo, + &pipelineBinaries[deviceIdx], + &cacheId[deviceIdx]); // Add the pipeline to any cache layer where it's missing. if (result == VK_SUCCESS) @@ -794,7 +838,7 @@ VkResult RayTracingPipeline::CreateImpl( // Join the binaries into a single blob. 
pDefaultCompiler->BuildRayTracingPipelineBinary( - &pipelineBinary[deviceIdx], + &pipelineBinaries[deviceIdx], &cachedBinData); if (cachedBinData.pCode != nullptr) @@ -814,7 +858,7 @@ VkResult RayTracingPipeline::CreateImpl( if (totalGroupCount > 0) { // Copy shader groups if compiler doesn't use pre-allocated buffer. - const auto& groupHandle = pipelineBinary[deviceIdx].shaderGroupHandle; + const auto& groupHandle = pipelineBinaries[deviceIdx].shaderGroupHandle; if (groupHandle.shaderHandles != pShaderGroups[deviceIdx]) { memcpy( @@ -825,13 +869,13 @@ VkResult RayTracingPipeline::CreateImpl( } } - m_hasTraceRay = pipelineBinary[DefaultDeviceIndex].hasTraceRay; + m_hasTraceRay = pipelineBinaries[DefaultDeviceIndex].hasTraceRay; uint32_t funcCount = 0; if (result == VK_SUCCESS) { - const auto pShaderProp = &pipelineBinary[DefaultDeviceIndex].shaderPropSet.shaderProps[0]; - const uint32_t shaderCount = pipelineBinary[DefaultDeviceIndex].shaderPropSet.shaderCount; + const auto pShaderProp = &pipelineBinaries[DefaultDeviceIndex].shaderPropSet.shaderProps[0]; + const uint32_t shaderCount = pipelineBinaries[DefaultDeviceIndex].shaderPropSet.shaderCount; for (uint32_t i = 0; i < shaderCount; i++) { if (pShaderProp[i].shaderId != RayTracingInvalidShaderId) @@ -854,7 +898,7 @@ VkResult RayTracingPipeline::CreateImpl( for (uint32_t deviceIdx = 0; deviceIdx != MaxPalDevices; ++deviceIdx) { - const auto& librarySummary = pipelineBinary[deviceIdx].librarySummary; + const auto& librarySummary = pipelineBinaries[deviceIdx].librarySummary; totalLibrarySummariesSize += Pow2Align(librarySummary.codeSize, 8); } @@ -875,7 +919,7 @@ VkResult RayTracingPipeline::CreateImpl( for (uint32_t deviceIdx = 0; deviceIdx != MaxPalDevices; ++deviceIdx) { - const auto& librarySummary = pipelineBinary[deviceIdx].librarySummary; + const auto& librarySummary = pipelineBinaries[deviceIdx].librarySummary; librarySummaries[deviceIdx].pCode = VoidPtrInc(pBuffer, offset); librarySummaries[deviceIdx].codeSize = librarySummary.codeSize; memcpy(VoidPtrInc(pBuffer, offset), librarySummary.pCode, librarySummary.codeSize); @@ -969,7 +1013,7 @@ VkResult RayTracingPipeline::CreateImpl( ((deviceIdx < m_pDevice->NumPalDevices()) && (palResult == Pal::Result::Success)); deviceIdx++) { - const auto pBinaries = pipelineBinary[deviceIdx].pPipelineBins; + const auto pBinaries = pipelineBinaries[deviceIdx].pPipelineBins; const auto ppDeviceShaderLibraries = ppShaderLibraries + deviceIdx * funcCount; void* pDeviceShaderLibraryMem = Util::VoidPtrInc(pPalShaderLibraryMem, deviceIdx * funcCount * shaderLibrarySize); @@ -984,14 +1028,14 @@ VkResult RayTracingPipeline::CreateImpl( localPipelineInfo.pipeline.flags.clientInternal = false; localPipelineInfo.pipeline.pipelineBinarySize = pBinaries[0].codeSize; localPipelineInfo.pipeline.pPipelineBinary = pBinaries[0].pCode; - localPipelineInfo.pipeline.maxFunctionCallDepth = pipelineBinary[deviceIdx].maxFunctionCallDepth; + localPipelineInfo.pipeline.maxFunctionCallDepth = pipelineBinaries[deviceIdx].maxFunctionCallDepth; } // Copy indirect function info uint32_t funcIndex = 0; - const auto pShaderProp = &pipelineBinary[deviceIdx].shaderPropSet.shaderProps[0]; - const uint32_t traceRayShaderIndex = pipelineBinary[deviceIdx].shaderPropSet.traceRayIndex; - const uint32_t shaderCount = pipelineBinary[deviceIdx].shaderPropSet.shaderCount; + const auto pShaderProp = &pipelineBinaries[deviceIdx].shaderPropSet.shaderProps[0]; + const uint32_t traceRayShaderIndex = 
pipelineBinaries[deviceIdx].shaderPropSet.traceRayIndex; + const uint32_t shaderCount = pipelineBinaries[deviceIdx].shaderPropSet.shaderCount; for (uint32_t i = 0; i < shaderCount; i++) { @@ -1251,10 +1295,10 @@ VkResult RayTracingPipeline::CreateImpl( const auto pPipelineLibShaderGroups = pPipelineLib->GetShaderGroupHandles(deviceIdx); const auto pLibGroupInfos = pPipelineLib->GetShaderGroupInfos(); - // update pipelineLibHasTraceRay and pipelineLibTraceRayVa - pipelineHasTraceRay = pPipelineLib->CheckHasTraceRay(); - if (pipelineHasTraceRay) + // update pipelineHasTraceRay and pipelineLibTraceRayVa + if (pPipelineLib->CheckHasTraceRay()) { + pipelineHasTraceRay = true; pipelineLibTraceRayVa = pPipelineLib->GetTraceRayGpuVa(deviceIdx); } @@ -1370,7 +1414,9 @@ VkResult RayTracingPipeline::CreateImpl( if (funcCount > 0) { const auto traceRayFuncIndex = funcCount - 1; - traceRayGpuVas[deviceIdx] = pIndirectFuncInfo[traceRayFuncIndex].gpuVirtAddr; + traceRayGpuVas[deviceIdx] = + pIndirectFuncInfo[traceRayFuncIndex].gpuVirtAddr | + pShaderProp[traceRayFuncIndex].shaderIdExtraBits; } else if (pipelineHasTraceRay) { @@ -1453,12 +1499,12 @@ VkResult RayTracingPipeline::CreateImpl( if (settings.enableDebugPrintf) { ClearFormatString(); - for (uint32_t i = 0; i < pipelineBinary[DefaultDeviceIndex].pipelineBinCount; ++i) + for (uint32_t i = 0; i < pipelineBinaries[DefaultDeviceIndex].pipelineBinCount; ++i) { DebugPrintf::DecodeFormatStringsFromElf( m_pDevice, - pipelineBinary[DefaultDeviceIndex].pPipelineBins[i].codeSize, - static_cast(pipelineBinary[DefaultDeviceIndex].pPipelineBins[i].pCode), + pipelineBinaries[DefaultDeviceIndex].pPipelineBins[i].codeSize, + static_cast(pipelineBinaries[DefaultDeviceIndex].pPipelineBins[i].pCode), GetFormatStrings()); } } @@ -1502,7 +1548,7 @@ VkResult RayTracingPipeline::CreateImpl( { m_pDevice->GetCompiler(deviceIdx)->FreeRayTracingPipelineBinary( &binaryCreateInfo, - &pipelineBinaries[deviceIdx]); } pAllocator->pfnFree(pAllocator->pUserData, pTempBuffer); @@ -1552,70 +1598,71 @@ static int32_t DeferredCreateRayTracingPipelineCallback( { case DeferredCallbackType::Join: { - uint32_t index = Util::AtomicIncrement(&pState->nextPending) - 1; - - const bool firstThread = (index == 0); - - // Run in a loop until we've processed all pipeline create infos. Parallel joins in their own loops can - // consume iterations. A single "main" thread per pipeline is sent out here. These threads will not return - // untill the pipeline has been fully created (unlike the helper worker threads). - while (index < pState->infoCount) + if (pState->nextPending < pState->infoCount) { - VkResult localResult = VK_SUCCESS; - const VkRayTracingPipelineCreateInfoKHR* pCreateInfo = &pState->pInfos[index]; - VkPipelineCreateFlags2KHR flags = - Device::GetPipelineCreateFlags(pCreateInfo); + uint32_t index = Util::AtomicIncrement(&pState->nextPending) - 1; - if (pState->skipRemaining == VK_FALSE) + // Run in a loop until we've processed all pipeline create infos. Parallel joins in their own loops can + // consume iterations. A single "main" thread per pipeline is sent out here. These threads will not return + // until the pipeline has been fully created (unlike the helper worker threads).
+ while (index < pState->infoCount) { - RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pState->pPipelines[index]); - - localResult = pPipeline->CreateImpl(pState->pPipelineCache, - pCreateInfo, - flags, - pState->pAllocator, - pOperation->Workload(index)); + VkResult localResult = VK_SUCCESS; + const VkRayTracingPipelineCreateInfoKHR* pCreateInfo = &pState->pInfos[index]; + VkPipelineCreateFlags2KHR flags = + Device::GetPipelineCreateFlags(pCreateInfo); -#if ICD_GPUOPEN_DEVMODE_BUILD - if (localResult == VK_SUCCESS) + if (pState->skipRemaining == VK_FALSE) { - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pState->pPipelines[index]); + + localResult = pPipeline->CreateImpl(pState->pPipelineCache, + pCreateInfo, + flags, + pState->pAllocator, + pOperation->Workload(index)); - if (pDevMgr != nullptr) +#if ICD_GPUOPEN_DEVMODE_BUILD + if (localResult == VK_SUCCESS) { - pDevMgr->PipelineCreated(pDevice, pPipeline); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); - if (pPipeline->IsInlinedShaderEnabled() == false) + if (pDevMode != nullptr) { - pDevMgr->ShaderLibrariesCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); + + if (pPipeline->IsInlinedShaderEnabled() == false) + { + pDevMode->ShaderLibrariesCreated(pDevice, pPipeline); + } } } - } #endif - } - - if (localResult != VK_SUCCESS) - { - Util::AtomicCompareAndSwap(&pState->finalResult, - static_cast(VK_SUCCESS), - static_cast(localResult)); + } - if (pCreateInfo->flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + if (localResult != VK_SUCCESS) { - Util::AtomicCompareAndSwap(&pState->skipRemaining, - VK_FALSE, - VK_TRUE); + Util::AtomicCompareAndSwap(&pState->finalResult, + static_cast(VK_SUCCESS), + static_cast(localResult)); + + if (pCreateInfo->flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + { + Util::AtomicCompareAndSwap(&pState->skipRemaining, + VK_FALSE, + VK_TRUE); + } } - } - // If the workloads for this pipeline are still pending (after creation), then no-op them at this point - Util::AtomicCompareAndSwap(&pOperation->Workload(index)->totalInstances, - UINT_MAX, - 0); + // If the workloads for this pipeline are still pending (after creation), then no-op them at this point + Util::AtomicCompareAndSwap(&pOperation->Workload(index)->totalInstances, + UINT_MAX, + 0); - Util::AtomicIncrement(&pState->completed); + Util::AtomicIncrement(&pState->completed); - index = Util::AtomicIncrement(&pState->nextPending) - 1; + index = Util::AtomicIncrement(&pState->nextPending) - 1; + } } // Helper worker threads go through here. They assist the main pipeline threads. 
Currently, the only workloads diff --git a/icd/api/renderpass/renderpass_builder.cpp b/icd/api/renderpass/renderpass_builder.cpp index dec85cf6..8d23e42a 100644 --- a/icd/api/renderpass/renderpass_builder.cpp +++ b/icd/api/renderpass/renderpass_builder.cpp @@ -921,7 +921,9 @@ static void IncludeWaitPoint( } // ===================================================================================================================== -static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) +static void ConvertImplicitSyncsLegacy( + RPBarrierInfo* pBarrier, + const RuntimeSettings& settings) { pBarrier->implicitSrcCacheMask = 0; pBarrier->implicitDstCacheMask = 0; @@ -934,9 +936,6 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) IncludePipePoint(pBarrier, Pal::HwPipeBottom); IncludeWaitPoint(pBarrier, Pal::HwPipePreBlt); - pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; - pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; - pBarrier->implicitSrcCacheMask |= pBarrier->flags.preColorResolveSync ? Pal::CoherColorTarget : Pal::CoherDepthStencilTarget; @@ -950,8 +949,6 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) { IncludeWaitPoint(pBarrier, Pal::HwPipePreBlt); - pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR; - pBarrier->implicitDstCacheMask |= Pal::CoherClear; } @@ -961,7 +958,57 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) IncludePipePoint(pBarrier, Pal::HwPipePostBlt); IncludeWaitPoint(pBarrier, Pal::HwPipeTop); - // Just going by the above wait point, the dstStageMask would be converted to TopOfPipe, but it is not optimal. + pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; + } + + if (pBarrier->flags.implicitExternalOutgoing && + (pBarrier->pipePointCount < (MaxHwPipePoints - 1)) && + settings.implicitExternalSynchronization) + { + // Since there is no handling of implicitExternalIncoming today, make this visible immediately. + IncludeWaitPoint(pBarrier, Pal::HwPipeTop); + + pBarrier->pipePoints[pBarrier->pipePointCount] = Pal::HwPipeBottom; + pBarrier->pipePointCount++; + + pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + } +} + +// ===================================================================================================================== +static void ConvertImplicitSyncs( + RPBarrierInfo* pBarrier, + const RuntimeSettings& settings) +{ + pBarrier->implicitSrcCacheMask = 0; + pBarrier->implicitDstCacheMask = 0; + + // Similarly augment the waiting if we need to wait for prior color rendering to finish + if (pBarrier->flags.preColorResolveSync || + pBarrier->flags.preDsResolveSync) + { + pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; + + pBarrier->implicitSrcCacheMask |= pBarrier->flags.preColorResolveSync ? Pal::CoherColorTarget : + Pal::CoherDepthStencilTarget; + pBarrier->implicitDstCacheMask |= Pal::CoherResolveDst; + } + + // Wait for (non-auto-synced) pre-clear if necessary. No need to augment the pipe point because the prior work falls + // under subpass dependency, but we may need to move the wait point forward to cover blts.
+ if (pBarrier->flags.preColorClearSync || + pBarrier->flags.preDsClearSync) + { + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR; + + pBarrier->implicitDstCacheMask |= Pal::CoherClear; + } + + // Augment the active source pipeline stages for resolves if we need to wait for prior resolves to complete + if (pBarrier->flags.postResolveSync) + { // TopOfPipe causes a stall at PFP which is not really needed for images. As an optimization for Acq-Rel // barriers we instead set dstStage to Blt here. pBarrier->srcStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; @@ -969,6 +1016,17 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; } + + if (pBarrier->flags.implicitExternalOutgoing && + (pBarrier->pipePointCount < (MaxHwPipePoints - 1)) && + settings.implicitExternalSynchronization) + { + pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; + + pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + } } // ===================================================================================================================== @@ -977,35 +1035,14 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) void RenderPassBuilder::PostProcessSyncPoint( SyncPointState* pSyncPoint) { - // Convert subpass dependency execution scope to PAL pipe/wait point - pSyncPoint->barrier.waitPoint = VkToPalWaitPipePoint(pSyncPoint->barrier.dstStageMask); - - pSyncPoint->barrier.pipePointCount = VkToPalSrcPipePoints(pSyncPoint->barrier.srcStageMask, - pSyncPoint->barrier.pipePoints); - - // Include implicit waiting and cache access - ConvertImplicitSyncs(&pSyncPoint->barrier); - - if (pSyncPoint->barrier.flags.implicitExternalOutgoing && - (pSyncPoint->barrier.pipePointCount < (MaxHwPipePoints - 1)) && - m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetRuntimeSettings().implicitExternalSynchronization) - { - // Since there is no handling of implicitExternalIncoming today, make this visible immediately. - IncludeWaitPoint(&pSyncPoint->barrier, Pal::HwPipeTop); - - pSyncPoint->barrier.pipePoints[pSyncPoint->barrier.pipePointCount] = Pal::HwPipeBottom; - pSyncPoint->barrier.pipePointCount++; - - pSyncPoint->barrier.srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; - pSyncPoint->barrier.dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; - - pSyncPoint->barrier.srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - } + const RuntimeSettings& settings = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetRuntimeSettings(); if (m_pDevice->GetPalProperties().gfxipProperties.flags.supportReleaseAcquireInterface && - m_pDevice->GetRuntimeSettings().useAcquireReleaseInterface) + settings.useAcquireReleaseInterface) { + // Include implicit waiting and cache access + ConvertImplicitSyncs(&pSyncPoint->barrier, settings); + // Need a global cache transition if any of the sync flags are set or if there's an app // subpass dependency that requires cache synchronization. 
if (((pSyncPoint->barrier.srcAccessMask != 0) || @@ -1074,6 +1111,15 @@ void RenderPassBuilder::PostProcessSyncPoint( } else { + // Convert subpass dependency execution scope to PAL pipe/wait point + pSyncPoint->barrier.waitPoint = VkToPalWaitPipePoint(pSyncPoint->barrier.dstStageMask); + + pSyncPoint->barrier.pipePointCount = VkToPalSrcPipePoints(pSyncPoint->barrier.srcStageMask, + pSyncPoint->barrier.pipePoints); + + // Include implicit waiting and cache access + ConvertImplicitSyncsLegacy(&pSyncPoint->barrier, settings); + // Need a global cache transition if any of the sync flags are set or if there's an app // subpass dependency that requires cache synchronization. if ((pSyncPoint->barrier.srcAccessMask != 0) || diff --git a/icd/api/renderpass/renderpass_types.h b/icd/api/renderpass/renderpass_types.h index a485e618..6c7d4a92 100644 --- a/icd/api/renderpass/renderpass_types.h +++ b/icd/api/renderpass/renderpass_types.h @@ -109,8 +109,10 @@ struct RPBindTargetsInfo struct RPBarrierInfo { // The following fields are a composite of all VkSubpassDependencies that affect this particular barrier: - PipelineStageFlags srcStageMask; - PipelineStageFlags dstStageMask; + PipelineStageFlags srcStageMask; // VK-srcStageMask. This will be converted to appropriate Pal + // source stage mask when passing barrier info to PAL + PipelineStageFlags dstStageMask; // VK-dstStageMask. This will be converted to appropriate Pal + // dst stage mask when passing barrier info to PAL AccessFlags srcAccessMask; AccessFlags dstAccessMask; Pal::HwPipePoint waitPoint; diff --git a/icd/api/sqtt/sqtt_layer.cpp b/icd/api/sqtt/sqtt_layer.cpp index 347ef2a3..638ab2f4 100644 --- a/icd/api/sqtt/sqtt_layer.cpp +++ b/icd/api/sqtt/sqtt_layer.cpp @@ -273,14 +273,14 @@ SqttCmdBufferState::SqttCmdBufferState( : m_pCmdBuf(pCmdBuf), m_pSqttMgr(pCmdBuf->VkDevice()->GetSqttMgr()), - m_pDevModeMgr(pCmdBuf->VkDevice()->VkInstance()->GetDevModeMgr()), + m_pDevMode(pCmdBuf->VkDevice()->VkInstance()->GetDevModeMgr()), m_settings(pCmdBuf->VkDevice()->GetRuntimeSettings()), m_pNextLayer(m_pSqttMgr->GetNextLayer()), m_currentEntryPoint(RgpSqttMarkerGeneralApiType::Invalid), m_currentEventId(0), m_currentEventType(RgpSqttMarkerEventType::InternalUnknown), #if ICD_GPUOPEN_DEVMODE_BUILD - m_instructionTrace({ false, DevModeMgr::InvalidTargetPipelineHash, VK_PIPELINE_BIND_POINT_MAX_ENUM }), + m_instructionTrace({ false, IDevMode::InvalidTargetPipelineHash, VK_PIPELINE_BIND_POINT_MAX_ENUM }), #endif m_debugTags(pCmdBuf->VkInstance()->Allocator()) { @@ -319,9 +319,9 @@ void SqttCmdBufferState::Begin( m_currentEventId = 0; #if ICD_GPUOPEN_DEVMODE_BUILD - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_instructionTrace.targetHash = m_pDevModeMgr->GetInstructionTraceTargetHash(); + m_instructionTrace.targetHash = m_pDevMode->GetInstructionTraceTargetHash(); } #endif @@ -376,10 +376,10 @@ void SqttCmdBufferState::End() WriteCbEndMarker(); #if ICD_GPUOPEN_DEVMODE_BUILD - if ((m_pDevModeMgr != nullptr) && + if ((m_pDevMode != nullptr) && (m_instructionTrace.started)) { - m_pDevModeMgr->StopInstructionTrace(m_pCmdBuf); + m_pDevMode->StopInstructionTrace(m_pCmdBuf); m_instructionTrace.started = false; } #endif @@ -550,7 +550,7 @@ void SqttCmdBufferState::WriteUserEventMarker( // ==================================================================================================================== void SqttCmdBufferState::RgdAnnotateCmdBuf() { - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { 
Pal::RgdMarkerInfoCmdBufData info = {}; info.header.infoType = Pal::RgdMarkerInfoTypeCmdBufStart; @@ -573,7 +573,7 @@ void SqttCmdBufferState::RgdAnnotateDispatch( { // CrashAnalysis already insert marker for all dispatches on PAL side. Here, we just provide additional context for // the described dispatch. - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { if ((type == RgpSqttMarkerEventType::CmdDispatch) || (type == RgpSqttMarkerEventType::CmdDispatchIndirect) @@ -608,7 +608,7 @@ void SqttCmdBufferState::RgdAnnotateDraw( { // CrashAnalysis already insert marker for all draws that comes from application on PAL side. Here, we just provide // additional context for the described draw. - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { if ((type == RgpSqttMarkerEventType::CmdDraw) || (type == RgpSqttMarkerEventType::CmdDrawIndexed)) { @@ -634,7 +634,7 @@ void SqttCmdBufferState::RgdInsertBarrierBeginMarker( Pal::Developer::BarrierType type, // Barrier type uint32 reason) // Reason for the barrier { - if (m_pDevModeMgr->IsCrashAnalysisEnabled() && + if (m_pDevMode->IsCrashAnalysisEnabled() && (m_currentEventType == RgpSqttMarkerEventType::CmdPipelineBarrier)) { Pal::RgdMarkerInfoBarrierBeginData info = {}; @@ -654,7 +654,7 @@ void SqttCmdBufferState::RgdInsertBarrierEndMarker( Pal::Developer::BarrierOperations operations) // What the barrier does { // CrashAnalysisCmdBuffer does not insert marker for Barrier. We insert as MarkerSource::Pal here. - if (m_pDevModeMgr->IsCrashAnalysisEnabled() && + if (m_pDevMode->IsCrashAnalysisEnabled() && (m_currentEventType == RgpSqttMarkerEventType::CmdPipelineBarrier)) { Pal::RgdMarkerInfoBarrierEndData info = {}; @@ -1049,12 +1049,12 @@ void SqttCmdBufferState::PipelineBound( const Pipeline* pPipeline = Pipeline::BaseObjectFromHandle(pipeline); #if ICD_GPUOPEN_DEVMODE_BUILD - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { if ((m_instructionTrace.started == false) && (pPipeline->GetApiHash() == m_instructionTrace.targetHash)) { - m_pDevModeMgr->StartInstructionTrace(m_pCmdBuf); + m_pDevMode->StartInstructionTrace(m_pCmdBuf); m_instructionTrace.bindPoint = bindPoint; m_instructionTrace.started = true; } @@ -2231,14 +2231,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); VkResult result = SQTT_CALL_NEXT_LAYER(vkCreateGraphicsPipelines)(device, pipelineCache, createInfoCount, pCreateInfos, pAllocator, pPipelines); if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (result == VK_SUCCESS) && - (pDevMgr != nullptr)) + (pDevMode != nullptr)) { for (uint32_t i = 0; i < createInfoCount; ++i) { @@ -2263,7 +2263,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( } #if ICD_GPUOPEN_DEVMODE_BUILD - pDevMgr->PipelineCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); #endif } } @@ -2283,14 +2283,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); VkResult result = SQTT_CALL_NEXT_LAYER(vkCreateComputePipelines)(device, pipelineCache, createInfoCount, pCreateInfos, 
pAllocator, pPipelines); if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (result == VK_SUCCESS) && - (pDevMgr != nullptr)) + (pDevMode != nullptr)) { for (uint32_t i = 0; i < createInfoCount; ++i) { @@ -2311,7 +2311,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( } #if ICD_GPUOPEN_DEVMODE_BUILD - pDevMgr->PipelineCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); #endif } } @@ -2333,14 +2333,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateRayTracingPipelinesKHR( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); VkResult result = SQTT_CALL_NEXT_LAYER(vkCreateRayTracingPipelinesKHR)(device, deferredOperation, pipelineCache, createInfoCount, pCreateInfos, pAllocator, pPipelines); if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && ((result == VK_SUCCESS) || (result == VK_OPERATION_DEFERRED_KHR)) && - (pDevMgr != nullptr)) + (pDevMode != nullptr)) { for (uint32_t i = 0; i < createInfoCount; ++i) { @@ -2367,11 +2367,11 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateRayTracingPipelinesKHR( #if ICD_GPUOPEN_DEVMODE_BUILD if (result != VK_OPERATION_DEFERRED_KHR) { - pDevMgr->PipelineCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); if (pPipeline->IsInlinedShaderEnabled() == false) { - pDevMgr->ShaderLibrariesCreated(pDevice, pPipeline); + pDevMode->ShaderLibrariesCreated(pDevice, pPipeline); } } #endif @@ -2534,16 +2534,16 @@ VKAPI_ATTR void VKAPI_CALL vkDestroyPipeline( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); #if ICD_GPUOPEN_DEVMODE_BUILD - if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (pDevMgr != nullptr)) + if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (pDevMode != nullptr)) { if (VK_NULL_HANDLE != pipeline) { Pipeline* pPipeline = Pipeline::BaseObjectFromHandle(pipeline); - pDevMgr->PipelineDestroyed(pDevice, pPipeline); + pDevMode->PipelineDestroyed(pDevice, pPipeline); #if VKI_RAY_TRACING if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) @@ -2552,7 +2552,7 @@ VKAPI_ATTR void VKAPI_CALL vkDestroyPipeline( if (pRtPipeline->IsInlinedShaderEnabled() == false) { - pDevMgr->ShaderLibrariesDestroyed(pDevice, pRtPipeline); + pDevMode->ShaderLibrariesDestroyed(pDevice, pRtPipeline); } } #endif @@ -2710,7 +2710,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkSetDebugUtilsObjectTagEXT( // calls but still want to start/stop RGP tracing. static void CheckRGPFrameBegin( Queue* pQueue, - DevModeMgr* pDevMode, + IDevMode* pDevMode, uint32_t submitCount, const VkSubmitInfo* pSubmits) { @@ -2732,7 +2732,7 @@ static void CheckRGPFrameBegin( if (pCmdBuf->HasDebugTag(frameBeginTag)) { - pDevMode->NotifyFrameBegin(pQueue, DevModeMgr::FrameDelimiterType::CmdBufferTag); + pDevMode->NotifyFrameBegin(pQueue, IDevMode::FrameDelimiterType::CmdBufferTag); return; } @@ -2745,7 +2745,7 @@ static void CheckRGPFrameBegin( // Looks for markers in a submitted command buffer to identify a forced end to an RGP trace. See CheckRGPFrameBegin(). 
static void CheckRGPFrameEnd( Queue* pQueue, - DevModeMgr* pDevMode, + IDevMode* pDevMode, uint32_t submitCount, const VkSubmitInfo* pSubmits) { @@ -2767,7 +2767,7 @@ static void CheckRGPFrameEnd( if (pCmdBuf->HasDebugTag(frameEndTag)) { - pDevMode->NotifyFrameEnd(pQueue, DevModeMgr::FrameDelimiterType::CmdBufferTag); + pDevMode->NotifyFrameEnd(pQueue, IDevMode::FrameDelimiterType::CmdBufferTag); return; } @@ -2786,7 +2786,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit( { Queue* pQueue = ApiQueue::ObjectFromHandle(queue); SqttMgr* pSqtt = pQueue->VkDevice()->GetSqttMgr(); - DevModeMgr* pDevMode = pQueue->VkDevice()->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pQueue->VkDevice()->VkInstance()->GetDevModeMgr(); #if ICD_GPUOPEN_DEVMODE_BUILD pDevMode->NotifyPreSubmit(); diff --git a/icd/api/sqtt/sqtt_layer.h b/icd/api/sqtt/sqtt_layer.h index da9bbdc8..9d93e2f0 100644 --- a/icd/api/sqtt/sqtt_layer.h +++ b/icd/api/sqtt/sqtt_layer.h @@ -56,7 +56,7 @@ class ImageView; class RenderPass; class SqttMgr; class Pipeline; -class DevModeMgr; +class IDevMode; // Contains parameters that are happening when renderpass targets are bound in the driver. struct SqttBindTargetParams @@ -207,7 +207,7 @@ class SqttCmdBufferState CmdBuffer* m_pCmdBuf; SqttMgr* m_pSqttMgr; // Per-device SQTT state - DevModeMgr* m_pDevModeMgr; + IDevMode* m_pDevMode; const RuntimeSettings& m_settings; const DispatchTable* m_pNextLayer; // Pointer to next layer's dispatch table RgpSqttMarkerCbID m_cbId; // Command buffer ID associated with this command buffer diff --git a/icd/api/strings/entry_points.txt b/icd/api/strings/entry_points.txt index b3a9763a..483270db 100644 --- a/icd/api/strings/entry_points.txt +++ b/icd/api/strings/entry_points.txt @@ -531,6 +531,17 @@ vkGetDeviceFaultInfoEXT @device @dext(EXT_devi vkGetShaderModuleIdentifierEXT @device @dext(EXT_shader_module_identifier) vkGetShaderModuleCreateInfoIdentifierEXT @device @dext(EXT_shader_module_identifier) +vkCreateIndirectCommandsLayoutNV @device @dext(NV_device_generated_commands) +vkDestroyIndirectCommandsLayoutNV @device @dext(NV_device_generated_commands) +vkGetGeneratedCommandsMemoryRequirementsNV @device @dext(NV_device_generated_commands) +vkCmdPreprocessGeneratedCommandsNV @device @dext(NV_device_generated_commands) +vkCmdExecuteGeneratedCommandsNV @device @dext(NV_device_generated_commands) +vkCmdBindPipelineShaderGroupNV @device @dext(NV_device_generated_commands) + +vkCmdUpdatePipelineIndirectBufferNV @device @dext(NV_device_generated_commands_compute) +vkGetPipelineIndirectDeviceAddressNV @device @dext(NV_device_generated_commands_compute) +vkGetPipelineIndirectMemoryRequirementsNV @device @dext(NV_device_generated_commands_compute) + vkCmdSetTessellationDomainOriginEXT @device @dext(EXT_extended_dynamic_state3) vkCmdSetDepthClampEnableEXT @device @dext(EXT_extended_dynamic_state3) vkCmdSetPolygonModeEXT @device @dext(EXT_extended_dynamic_state3) diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index 0642bf17..86ba3001 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -207,10 +207,13 @@ VK_EXT_attachment_feedback_loop_layout VK_EXT_physical_device_drm VK_KHR_cooperative_matrix VK_EXT_texture_compression_astc_hdr +VK_NV_device_generated_commands +VK_NV_device_generated_commands_compute VK_EXT_image_drm_format_modifier VK_KHR_shader_expect_assume VK_KHR_shader_subgroup_rotate VK_KHR_shader_quad_control +VK_EXT_nested_command_buffer VK_KHR_dynamic_rendering_local_read 
VK_KHR_vertex_attribute_divisor VK_EXT_frame_boundary diff --git a/icd/api/strings/generate_strings.py b/icd/api/strings/generate_strings.py index 9fbe54a6..0eea3b1e 100644 --- a/icd/api/strings/generate_strings.py +++ b/icd/api/strings/generate_strings.py @@ -149,7 +149,7 @@ def generate_entry_point_condition(f, name, cond): def get_compile_condition(cond): """Assemble condition macro name""" cond = cond.replace('@none', '') - cond = cond.replace('@win32', '_WIN32') + cond = cond.replace('@win32', 'defined(_WIN32)') core = re.compile(r'@core(?:_build_only)?\( ( [^\.]* ) \. ( [^\)]* ) \)', re.VERBOSE) cond = core.sub(r'VK_VERSION_\1_\2', cond) diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 10649402..8170a40f 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -43,6 +43,7 @@ #include "include/vk_utils.h" #include "include/vk_query.h" #include "include/vk_queue.h" +#include "include/vk_indirect_commands_layout.h" #if VKI_RAY_TRACING #include "raytrace/vk_acceleration_structure.h" @@ -599,6 +600,16 @@ CmdBuffer::CmdBuffer( m_reverseThreadGroupState(false) #if VKI_RAY_TRACING , m_scratchVidMemList(pDevice->VkInstance()->Allocator()) + , m_maxCpsMemSize(0) + , m_patchCpsList + { + pDevice->VkInstance()->Allocator(), +#if VKI_BUILD_MAX_NUM_GPUS > 1 + pDevice->VkInstance()->Allocator(), + pDevice->VkInstance()->Allocator(), + pDevice->VkInstance()->Allocator() +#endif + } #endif { m_flags.wasBegun = false; @@ -1310,6 +1321,8 @@ VkResult CmdBuffer::Begin( #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); + + m_maxCpsMemSize = 0; #endif const PhysicalDevice* pPhysicalDevice = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex); @@ -1854,6 +1867,7 @@ VkResult CmdBuffer::Reset(VkCommandBufferResetFlags flags) #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); + FreePatchCpsList(); #endif result = PalToVkResult(PalCmdBufferReset(releaseResources)); @@ -2337,6 +2351,7 @@ VkResult CmdBuffer::Destroy(void) #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); + FreePatchCpsList(); #endif @@ -2956,6 +2971,8 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( const VkBuffer buffer = pBuffers[inputIdx]; const VkDeviceSize offset = pOffsets[inputIdx]; + bool padVertexBuffers = m_flags.padVertexBuffers; + if (buffer != VK_NULL_HANDLE) { const Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); @@ -2964,6 +2981,12 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( if ((pSizes != nullptr) && (pSizes[inputIdx] != VK_WHOLE_SIZE)) { pBinding->range = pSizes[inputIdx]; + + if (offset != 0) + { + padVertexBuffers = true; + } + } else { @@ -2981,7 +3004,7 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( pBinding->stride = pStrides[inputIdx]; } - if (m_flags.padVertexBuffers && (pBinding->stride != 0)) + if (padVertexBuffers && (pBinding->stride != 0)) { pBinding->range = Util::RoundUpToMultiple(pBinding->range, pBinding->stride); } @@ -3112,7 +3135,7 @@ void CmdBuffer::Draw( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif { @@ -3139,7 +3162,7 @@ void CmdBuffer::DrawIndexed( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 
0, nullptr, 0, 0); #endif { @@ -3155,7 +3178,7 @@ void CmdBuffer::DrawIndexed( } // ===================================================================================================================== -template< bool indexed, bool useBufferCount> +template<bool indexed, bool useBufferCount> void CmdBuffer::DrawIndirect( VkBuffer buffer, VkDeviceSize offset, @@ -3169,7 +3192,7 @@ void CmdBuffer::DrawIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); @@ -3217,6 +3240,51 @@ void CmdBuffer::DrawIndirect( DbgBarrierPostCmd((indexed ? DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect); } +// ===================================================================================================================== +template<bool indexed, bool useBufferCount> +void CmdBuffer::DrawIndirect( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa) +{ + DbgBarrierPreCmd((indexed ? DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect); + + ValidateGraphicsStates(); + +#if VKI_RAY_TRACING + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); +#endif + + VK_ASSERT(stride <= indirectBufferSize); + Pal::GpuVirtAddrAndStride gpuVirtAddrAndStride = { indirectBufferVa, stride }; + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + + if (indexed == false) + { + PalCmdBuffer(deviceIdx)->CmdDrawIndirectMulti( + gpuVirtAddrAndStride, + count, + useBufferCount ? countBufferVa : 0); + } + else + { + PalCmdBuffer(deviceIdx)->CmdDrawIndexedIndirectMulti( + gpuVirtAddrAndStride, + count, + useBufferCount ? countBufferVa : 0); + } + } + while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd((indexed ?
DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect); +} + // ===================================================================================================================== void CmdBuffer::DrawMeshTasks( uint32_t x, @@ -3230,7 +3298,7 @@ void CmdBuffer::DrawMeshTasks( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif PalCmdDrawMeshTasks(x, y, z); @@ -3254,7 +3322,7 @@ void CmdBuffer::DrawMeshTasksIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif PalCmdDrawMeshTasksIndirect(buffer, offset, count, stride, countBuffer, countOffset); @@ -3262,6 +3330,40 @@ void CmdBuffer::DrawMeshTasksIndirect( DbgBarrierPostCmd(DbgBarrierDrawMeshTasksIndirect); } +// ===================================================================================================================== +template<bool useBufferCount> +void CmdBuffer::DrawMeshTasksIndirect( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa) +{ + DbgBarrierPreCmd(DbgBarrierDrawMeshTasksIndirect); + + ValidateGraphicsStates(); + +#if VKI_RAY_TRACING + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); +#endif + + VK_ASSERT(stride <= indirectBufferSize); + + Pal::GpuVirtAddrAndStride gpuVirtAddrAndStride = { indirectBufferVa, stride }; + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + PalCmdBuffer(deviceGroup.Index())->CmdDispatchMeshIndirectMulti( + gpuVirtAddrAndStride, + count, + useBufferCount ?
countBufferVa : 0); + + } while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(DbgBarrierDrawMeshTasksIndirect); +} + // ===================================================================================================================== void CmdBuffer::Dispatch( uint32_t x, @@ -3276,7 +3378,7 @@ void CmdBuffer::Dispatch( } #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, x, y, z, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, x, y, z, nullptr, 0, 0); #endif if (m_pDevice->GetRuntimeSettings().enableAlternatingThreadGroupOrder) @@ -3307,7 +3409,7 @@ void CmdBuffer::DispatchOffset( #if VKI_RAY_TRACING BindRayQueryConstants( - m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, dim_x, dim_y, dim_z, nullptr, 0); + m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, dim_x, dim_y, dim_z, nullptr, 0, 0); #endif PalCmdDispatchOffset(base_x, base_y, base_z, dim_x, dim_y, dim_z); @@ -3336,7 +3438,8 @@ void CmdBuffer::DispatchIndirect( 0, 0, pBuffer, - offset); + offset, + 0); #endif PalCmdDispatchIndirect(pBuffer, offset); @@ -3344,6 +3447,51 @@ void CmdBuffer::DispatchIndirect( DbgBarrierPostCmd(DbgBarrierDispatchIndirect); } +// ===================================================================================================================== +void CmdBuffer::DispatchIndirect( + VkDeviceSize indirectBufferVa) +{ + DbgBarrierPreCmd(DbgBarrierDispatchIndirect); + + if (PalPipelineBindingOwnedBy(Pal::PipelineBindPoint::Compute, PipelineBindCompute) == false) + { + RebindPipeline(); + } + +#if VKI_RAY_TRACING + BindRayQueryConstants( + m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, 0, 0, 0, nullptr, 0, indirectBufferVa); +#endif + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + PalCmdBuffer(deviceIdx)->CmdDispatchIndirect(indirectBufferVa); + } + while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(DbgBarrierDispatchIndirect); +} + +// ===================================================================================================================== +void CmdBuffer::ExecuteIndirect( + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoNV* pInfo) +{ + IndirectCommandsLayout* pLayout = IndirectCommandsLayout::ObjectFromHandle(pInfo->indirectCommandsLayout); + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + pLayout->BindPreprocessBuffer(pInfo->preprocessBuffer, + pInfo->preprocessOffset, + deviceIdx); + } + while (deviceGroup.IterateNext()); +} + // ===================================================================================================================== // Performs a color clear (vkCmdClearColorImage) void CmdBuffer::ClearColorImage( @@ -3410,7 +3558,7 @@ void CmdBuffer::ClearColorImage( pPalRanges, 0, nullptr, - settings.enableColorClearAutoSync ? Pal::ColorClearAutoSync : 0); + settings.enableColorClearAutoSync ? 
static_cast<uint32>(Pal::ColorClearAutoSync) : 0); } virtStackFrame.FreeArray(pPalRanges); @@ -4882,7 +5030,7 @@ void CmdBuffer::PostDrawPreResolveSync() barrierInfo.transitionCount = 1; barrierInfo.pTransitions = &transition; - PalCmdBuffer(DefaultDeviceIndex)->CmdBarrier(barrierInfo); + PalCmdBarrier(barrierInfo, m_curDeviceMask); } // ===================================================================================================================== @@ -8027,9 +8175,7 @@ void CmdBuffer::RPSyncPoint( // Execute the barrier if it actually did anything if ((acquireReleaseInfo.dstGlobalStageMask != Pal::PipelineStageBottomOfPipe) || - ((acquireReleaseInfo.imageBarrierCount > 0) && isDstStageNotBottomOfPipe) || - ((rpBarrier.pipePointCount > 1) || - ((rpBarrier.pipePointCount == 1) && (rpBarrier.pipePoints[0] != Pal::HwPipeTop)))) + ((acquireReleaseInfo.imageBarrierCount > 0) && isDstStageNotBottomOfPipe)) { PalCmdReleaseThenAcquire( &acquireReleaseInfo, @@ -9797,49 +9943,48 @@ void CmdBuffer::DbgCmdBarrier(bool preCmd) (static_cast<uint32>(Pal::CacheCoherencyUsageFlags::CoherPresent) == CoherPresent)), "The PAL::CacheCoherencyUsageFlags enum has changed. Vulkan settings might need to be updated."); - Pal::HwPipePoint waitPoint; - Pal::HwPipePoint signalPoint; + uint32_t srcStageMask; + uint32_t dstStageMask; + uint32_t srcCacheMask; uint32_t dstCacheMask; if (preCmd) { - waitPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreWaitPipePoint); - signalPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreSignalPipePoint); + dstStageMask = ConvertWaitPointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreWaitPipePoint)); + srcStageMask = ConvertPipePointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreSignalPipePoint)); srcCacheMask = settings.dbgBarrierPreCacheSrcMask; dstCacheMask = settings.dbgBarrierPreCacheDstMask; } else { - waitPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostWaitPipePoint); - signalPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostSignalPipePoint); + dstStageMask = ConvertWaitPointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostWaitPipePoint)); + srcStageMask = ConvertPipePointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostSignalPipePoint)); srcCacheMask = settings.dbgBarrierPostCacheSrcMask; dstCacheMask = settings.dbgBarrierPostCacheDstMask; } - Pal::BarrierInfo barrier = {}; + Pal::AcquireReleaseInfo barrier = {}; - barrier.reason = RgpBarrierUnknownReason; // This code is debug-only code. - barrier.waitPoint = waitPoint; + barrier.reason = RgpBarrierUnknownReason; // This code is debug-only code.
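// A worked example of the mapping above, since its direction is easy to misread: the legacy wait
// point says where the destination work stalls, so it feeds dstGlobalStageMask, while the legacy
// signal point says where the source work completes, so it feeds srcGlobalStageMask. Assuming a
// debug barrier configured with signalPoint = HwPipePostCs and waitPoint = HwPipePreCs, both masks
// would resolve to roughly Pal::PipelineStageCs; the exact flags here are illustrative, and
// ConvertPipePointToPipeStage()/ConvertWaitPointToPipeStage() own the real translation.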
+ barrier.dstGlobalStageMask = dstStageMask; - if (waitPoint != Pal::HwPipeTop || signalPoint != Pal::HwPipeTop) + if ((dstStageMask != Pal::PipelineStageTopOfPipe) || (srcStageMask != Pal::PipelineStageTopOfPipe)) { - barrier.pipePointWaitCount = 1; - barrier.pPipePoints = &signalPoint; + barrier.srcGlobalStageMask = srcStageMask; } - Pal::BarrierTransition transition = {}; - if (srcCacheMask != 0 || dstCacheMask != 0) { - transition.srcCacheMask = srcCacheMask; - transition.dstCacheMask = dstCacheMask; - - barrier.transitionCount = 1; - barrier.pTransitions = &transition; + barrier.srcGlobalAccessMask = srcCacheMask; + barrier.dstGlobalAccessMask = dstCacheMask; } - PalCmdBarrier(barrier, m_curDeviceMask); + PalCmdReleaseThenAcquire(barrier, m_curDeviceMask); } #endif @@ -10055,7 +10200,7 @@ void CmdBuffer::DrawIndirectByteCount( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif utils::IterateMask deviceGroup(m_curDeviceMask); @@ -10456,12 +10601,55 @@ void CmdBuffer::TraceRays( while (deviceGroup.IterateNext()); } +// ===================================================================================================================== +void CmdBuffer::AddPatchCpsRequest( + uint32_t deviceIdx, + GpuRt::DispatchRaysConstants* pConstsMem, + uint64_t bufSize) +{ + VK_ASSERT(pConstsMem != nullptr); + m_maxCpsMemSize = Util::Max(m_maxCpsMemSize, bufSize); + Pal::Result result = m_patchCpsList[deviceIdx].PushBack(pConstsMem); + VK_ASSERT(result == Pal::Result::Success); +} + +// ===================================================================================================================== +void CmdBuffer::FreePatchCpsList() +{ + utils::IterateMask deviceGroup(m_curDeviceMask); + + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + m_patchCpsList[deviceIdx].Clear(); + } + while (deviceGroup.IterateNext()); +} + +// ===================================================================================================================== +// Fill bufVa into each patch request (call this at execute time).
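// A note on the flow here: GetRayTracingDispatchArgs() below records a patch request while the
// command buffer is still being built, because the size of the continuation-stack (CPS) allocation
// is only known once every trace-rays launch in the command buffer has been seen. At submission the
// driver allocates a single CPS buffer of m_maxCpsMemSize bytes and calls ApplyPatchCpsRequests(),
// which writes the final GPU VA into each embedded-data copy of GpuRt::DispatchRaysConstants.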
+void CmdBuffer::ApplyPatchCpsRequests( + uint32_t deviceIdx, + const Pal::IGpuMemory& cpsMem) const +{ + for (PatchCpsVector::Iter iter = m_patchCpsList[deviceIdx].Begin(); iter.Get() != nullptr; iter.Next()) + { + GpuRt::DispatchRaysConstants* pConstsMem = iter.Get(); + + m_pDevice->RayTrace()->GpuRt(deviceIdx)->PatchDispatchRaysConstants( + pConstsMem, + cpsMem.Desc().gpuVirtAddr, + m_maxCpsMemSize); + } +} + // ===================================================================================================================== void CmdBuffer::GetRayTracingDispatchArgs( uint32_t deviceIdx, const RuntimeSettings& settings, CmdPool* pCmdPool, const RayTracingPipeline* pPipeline, + uint32* pConstMem, Pal::gpusize constGpuAddr, uint32_t width, uint32_t height, @@ -10509,8 +10697,16 @@ void CmdBuffer::GetRayTracingDispatchArgs( pConstants->constData.cpsBackendStackSize = stackSizes.backendSize; if (settings.cpsFlags & CpsFlagStackInGlobalMem) { - // TODO: Record Cps stack requirement, create Cps stack at queue submission, and fill - // pConstants->constData.cpsGlobalMemoryAddressLo/Hi + const uint32 numRays = width * height * depth; + + const gpusize cpsMemorySize = m_pDevice->RayTrace()->GpuRt(deviceIdx)->GetCpsMemoryBytes( + stackSizes.frontendSize, + numRays); + + AddPatchCpsRequest( + deviceIdx, + reinterpret_cast<GpuRt::DispatchRaysConstants*>(pConstMem), + cpsMemorySize); } } @@ -10559,9 +10755,9 @@ void CmdBuffer::TraceRayPreSetup( const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); const RayTracingPipeline* pPipeline = m_allGpuState.pRayTracingPipeline; - void* pConstData = PalCmdBuffer(deviceIdx)->CmdAllocateEmbeddedData(GpuRt::DispatchRaysConstantsDw, - 1, - pConstGpuAddr); + uint32* pConstMem = PalCmdBuffer(deviceIdx)->CmdAllocateEmbeddedData(GpuRt::DispatchRaysConstantsDw, + 1, + pConstGpuAddr); GpuRt::DispatchRaysConstants constants = {}; @@ -10569,6 +10765,7 @@ settings, m_pCmdPool, pPipeline, + pConstMem, *pConstGpuAddr, width, height, @@ -10579,7 +10776,7 @@ callableShaderBindingTable, &constants); - memcpy(pConstData, &constants, sizeof(constants)); + memcpy(pConstMem, &constants, sizeof(constants)); } // ===================================================================================================================== @@ -10958,7 +11155,8 @@ void CmdBuffer::BindRayQueryConstants( uint32_t height, uint32_t depth, Buffer* pIndirectBuffer, - VkDeviceSize indirectOffset) + VkDeviceSize indirectOffset, + VkDeviceSize indirectBufferVirtAddr) { if ((pPipeline != nullptr) && pPipeline->HasRayTracing()) { @@ -11004,7 +11202,7 @@ void CmdBuffer::BindRayQueryConstants( gpusize indirectBufferVa = (pIndirectBuffer != nullptr) ?
pIndirectBuffer->GpuVirtAddr(deviceIdx) + indirectOffset : - 0; + indirectBufferVirtAddr; if (indirectBufferVa == 0) { @@ -11078,10 +11276,10 @@ void CmdBuffer::InsertDebugMarker( #if ICD_GPUOPEN_DEVMODE_BUILD constexpr uint8 MarkerSourceApplication = 0; - const DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + const IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); // Insert Crash Analysis markers if requested - if ((pDevModeMgr != nullptr) && (pDevModeMgr->IsCrashAnalysisEnabled())) + if ((pDevMode != nullptr) && (pDevMode->IsCrashAnalysisEnabled())) { PalCmdBuffer(DefaultDeviceIndex)->CmdInsertExecutionMarker(isBegin, MarkerSourceApplication, @@ -11374,6 +11572,7 @@ void CmdBuffer::ValidateGraphicsStates() params.pPipeline = pGraphicsPipeline->GetPalPipeline(deviceIdx); params.gfxDynState = m_allGpuState.pipelineState[PipelineBindGraphics].dynamicBindInfo.gfxDynState; + params.gfxShaderInfo = pGraphicsPipeline->GetBindInfo(); if (params.gfxDynState.enable.depthClampMode && (params.gfxDynState.enable.depthClipMode == false)) { @@ -11463,16 +11662,16 @@ void CmdBuffer::ValidateGraphicsStates() Device::SetDefaultVrsRateParams(&vrsRate); } - if (m_allGpuState.minSampleShading > 0.0) + // Both MSAA and VRS would utilize the value of PS_ITER_SAMPLES. + // Thus, choose the min combiner (i.e. choose the higher quality rate) when both features are + // enabled. + if ((m_allGpuState.msaaCreateInfo.pixelShaderSamples > 1) && + (m_allGpuState.vrsRate.flags.exposeVrsPixelsMask == 1) && + (pGraphicsPipeline != nullptr) && + (pGraphicsPipeline->GetPipelineFlags().shadingRateUsedInShader == false)) { - if ((m_allGpuState.vrsRate.shadingRate == Pal::VrsShadingRate::_1x1) && - (pGraphicsPipeline != nullptr) && - (pGraphicsPipeline->GetPipelineFlags().shadingRateUsedInShader == false) && - pGraphicsPipeline->ContainsDynamicState(DynamicStatesInternal::FragmentShadingRateStateKhr)) - { - vrsRate.combinerState[static_cast<uint32>(Pal::VrsCombinerStage::PsIterSamples)] = - Pal::VrsCombiner::Override; - } + vrsRate.combinerState[static_cast<uint32>(Pal::VrsCombinerStage::PsIterSamples)] = + Pal::VrsCombiner::Min; } PalCmdBuffer(deviceIdx)->CmdSetPerDrawVrsRate(vrsRate); @@ -12655,6 +12854,38 @@ void CmdBuffer::DrawIndirect( VkBuffer countBuffer, VkDeviceSize countOffset); +template +void CmdBuffer::DrawIndirect<false, false>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawIndirect<false, true>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawIndirect<true, false>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawIndirect<true, true>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + template void CmdBuffer::DrawMeshTasksIndirect( VkBuffer buffer, @@ -12673,6 +12904,22 @@ void CmdBuffer::DrawMeshTasksIndirect( VkBuffer countBuffer, VkDeviceSize countOffset); +template +void CmdBuffer::DrawMeshTasksIndirect<false>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawMeshTasksIndirect<true>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, +
VkDeviceSize countBufferVa); + template void CmdBuffer::ResolveImage( VkImage srcImage, diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index 2885aa4a..d2965fe0 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -105,9 +105,10 @@ VkResult ComputePipeline::CreatePipelineBinaries( Vkgc::BinaryData* pPipelineBinaries, PipelineMetadata* pBinaryMetadata) { - VkResult result = VK_SUCCESS; - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + bool storeBinaryToCache = true; // Load or create the pipeline binary PipelineBinaryCache* pPipelineBinaryCache = (pPipelineCache != nullptr) ? pPipelineCache->GetPipelineCache() @@ -121,7 +122,7 @@ VkResult ComputePipeline::CreatePipelineBinaries( if (shouldCompile) { - bool skipCacheQuery = settings.enablePipelineDump; + bool skipCacheQuery = false; if (skipCacheQuery == false) { @@ -140,21 +141,60 @@ VkResult ComputePipeline::CreatePipelineBinaries( } } - // Compile if unable to retrieve from cache if (shouldCompile) { - if (pBinaryCreateInfo->pTempBuffer == nullptr) + if ((pDevice->GetRuntimeSettings().ignoreFlagFailOnPipelineCompileRequired == false) && + (flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)) { - result = pDefaultCompiler->ConvertComputePipelineInfo( - pDevice, - pCreateInfo, - pShaderInfo, - pPipelineOptimizerKey, - pBinaryMetadata, - pBinaryCreateInfo, - flags); + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } + + bool shouldConvert = (pCreateInfo != nullptr) && + (pDevice->GetRuntimeSettings().enablePipelineDump || + (shouldCompile && (pBinaryCreateInfo->pTempBuffer == nullptr))); + + VkResult convertResult = VK_ERROR_UNKNOWN; + if (shouldConvert) + { + convertResult = pDefaultCompiler->ConvertComputePipelineInfo( + pDevice, + pCreateInfo, + pShaderInfo, + pPipelineOptimizerKey, + pBinaryMetadata, + pBinaryCreateInfo, + flags); + result = (result == VK_SUCCESS) ? 
convertResult : result; + } + + if ((result == VK_SUCCESS) && (convertResult == VK_SUCCESS) && shouldCompile) + { + if (IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.cs)) + { + result = VK_ERROR_UNKNOWN; + } + } + + if (pDevice->GetRuntimeSettings().enablePipelineDump && (convertResult == VK_SUCCESS)) + { + if ((shouldCompile == false) || (result != VK_SUCCESS)) + { + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pComputeInfo = &pBinaryCreateInfo->pipelineInfo; + pDefaultCompiler->DumpPipeline( + pDevice->GetRuntimeSettings(), + pipelineInfo, + pBinaryCreateInfo->apiPsoHash, + 1, + &pPipelineBinaries[deviceIdx], + result); } + } + // Compile if unable to retrieve from cache + if (shouldCompile) + { if (result == VK_SUCCESS) { result = pDevice->GetCompiler(deviceIdx)->CreateComputePipelineBinary( @@ -185,7 +225,8 @@ VkResult ComputePipeline::CreatePipelineBinaries( } // Add to any cache layer where missing - if (result == VK_SUCCESS) + if ((result == VK_SUCCESS) && storeBinaryToCache) + { pDevice->GetCompiler(deviceIdx)->CachePipelineBinary( &pCacheIds[deviceIdx], @@ -330,45 +371,48 @@ VkResult ComputePipeline::Create( uint64 startTimeTicks = Util::GetPerfCpuTime(); // Setup PAL create info from Vulkan inputs - Vkgc::BinaryData pipelineBinaries[MaxPalDevices] = {}; - Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; - PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - ComputePipelineBinaryCreateInfo binaryCreateInfo = {}; - PipelineOptimizerKey pipelineOptimizerKey = {}; - ShaderOptimizerKey shaderOptimizerKey = {}; - ShaderModuleHandle tempModule = {}; - VkResult result = VK_SUCCESS; - PipelineMetadata binaryMetadata = {}; - ComputePipelineExtStructs extStructs = {}; + Vkgc::BinaryData pipelineBinaries[MaxPalDevices] = {}; + Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + ComputePipelineBinaryCreateInfo binaryCreateInfo = {}; + PipelineOptimizerKey pipelineOptimizerKey = {}; + ShaderOptimizerKey shaderOptimizerKey = {}; + ShaderModuleHandle tempModule = {}; + VkResult result = VK_SUCCESS; + PipelineMetadata binaryMetadata = {}; + ComputePipelineExtStructs extStructs = {}; + bool binariesProvided = false; HandleExtensionStructs(pCreateInfo, &extStructs); ComputePipelineShaderStageInfo shaderInfo = {}; uint64_t apiPsoHash = {}; - // 1. Create Cache IDs - result = ComputePipeline::CreateCacheId( - pDevice, - pCreateInfo, - flags, - &shaderInfo, - &binaryCreateInfo, - &shaderOptimizerKey, - &pipelineOptimizerKey, - &apiPsoHash, - &tempModule, - &cacheId[0]); - - binaryCreateInfo.apiPsoHash = apiPsoHash; - - // 2. Create pipeline binaries (or load from cache) auto pPipelineCreationFeedbackCreateInfo = extStructs.pPipelineCreationFeedbackCreateInfoEXT; PipelineCompiler::InitPipelineCreationFeedback(pPipelineCreationFeedbackCreateInfo); - if (result == VK_SUCCESS) + if ((result == VK_SUCCESS) && (binariesProvided == false)) { + // 1. Create Cache IDs + result = ComputePipeline::CreateCacheId( + pDevice, + pCreateInfo, + flags, + &shaderInfo, + &binaryCreateInfo, + &shaderOptimizerKey, + &pipelineOptimizerKey, + &apiPsoHash, + &tempModule, + &cacheId[0]); + + binaryCreateInfo.apiPsoHash = apiPsoHash; + + // 2. 
Create pipeline binaries (or load from cache) + if (result == VK_SUCCESS) + { result = CreatePipelineBinaries( pDevice, pCreateInfo, @@ -381,6 +425,8 @@ VkResult ComputePipeline::Create( cacheId, pipelineBinaries, &binaryMetadata); + } + } CreateInfo localPipelineInfo = {}; @@ -411,9 +457,11 @@ VkResult ComputePipeline::Create( pDevice->PalDevice(DefaultDeviceIndex)->GetComputePipelineSize(localPipelineInfo.pipeline, &palResult); VK_ASSERT(palResult == Pal::Result::Success); + size_t allocationSize = sizeof(ComputePipeline) + (pipelineSize * pDevice->NumPalDevices()); + pSystemMem = pDevice->AllocApiObject( pAllocator, - sizeof(ComputePipeline) + (pipelineSize * pDevice->NumPalDevices())); + allocationSize); if (pSystemMem == nullptr) { @@ -483,6 +531,7 @@ VkResult ComputePipeline::Create( } result = PalToVkResult(palResult); + } if (result == VK_SUCCESS) @@ -525,6 +574,7 @@ VkResult ComputePipeline::Create( } else { + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { // Internal memory allocation failed, free PAL event object if it gets created @@ -541,7 +591,7 @@ VkResult ComputePipeline::Create( // Free the created pipeline binaries now that the PAL Pipelines/PipelineBinaryInfo have read them. for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { - if (pipelineBinaries[deviceIdx].pCode != nullptr) + if ((binariesProvided == false) && (pipelineBinaries[deviceIdx].pCode != nullptr)) { pDevice->GetCompiler(deviceIdx)->FreeComputePipelineBinary( &binaryCreateInfo, pipelineBinaries[deviceIdx]); @@ -608,7 +658,7 @@ VkResult ComputePipeline::Create( // ===================================================================================================================== // Create cacheId for a compute pipeline. 
VkResult ComputePipeline::CreateCacheId( - Device* pDevice, + const Device* pDevice, const VkComputePipelineCreateInfo* pCreateInfo, VkPipelineCreateFlags2KHR flags, ComputePipelineShaderStageInfo* pShaderInfo, diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index 918d9548..d43cd44f 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -899,6 +899,62 @@ const char* VkResultName( case VkResult::VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT: errName = "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT"; break; + + case VkResult::VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_THREAD_IDLE_KHR: + errName = "VK_THREAD_IDLE_KHR"; + break; + + case VkResult::VK_THREAD_DONE_KHR: + errName = "VK_THREAD_DONE_KHR"; + break; + case VkResult::VK_OPERATION_DEFERRED_KHR: + errName = "VK_OPERATION_DEFERRED_KHR"; + break; + + case VkResult::VK_OPERATION_NOT_DEFERRED_KHR: + errName = "VK_OPERATION_NOT_DEFERRED_KHR"; + break; + + case VkResult::VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR: + errName = "VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR"; + break; + + case VkResult::VK_ERROR_COMPRESSION_EXHAUSTED_EXT: + errName = "VK_ERROR_COMPRESSION_EXHAUSTED_EXT"; + break; + + case VkResult::VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT: + errName = "VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT"; + break; + + case VkResult::VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT: + errName = "VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT"; + break; + default: VK_NOT_IMPLEMENTED; errName = "??"; diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index 7fc6172c..567b1495 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -566,7 +566,9 @@ VkResult DescriptorGpuMemHeap::Init( if (pTypeCount[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { uint32_t maxSize = 0; - if (pMutableDescriptorTypeCreateInfoEXT != nullptr) + if ((pMutableDescriptorTypeCreateInfoEXT != nullptr) && + (pMutableDescriptorTypeCreateInfoEXT->pMutableDescriptorTypeLists != nullptr) && + (i < pMutableDescriptorTypeCreateInfoEXT->mutableDescriptorTypeListCount)) { const VkMutableDescriptorTypeListEXT& list = pMutableDescriptorTypeCreateInfoEXT->pMutableDescriptorTypeLists[i]; diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 86a3006b..98db2cc7 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -69,6 +69,7 @@ #include "include/graphics_pipeline_common.h" #include "include/vk_graphics_pipeline_library.h" #include "include/internal_layer_hooks.h" +#include "include/vk_indirect_commands_layout.h" #if VKI_RAY_TRACING #include "raytrace/ray_tracing_device.h" @@ -2394,7 +2395,10 @@ VkResult Device::WaitForFences( ppPalFences[i] = Fence::ObjectFromHandle(pFences[i])->PalFence(DefaultDeviceIndex); } - palResult = 
PalDevice(DefaultDeviceIndex)->WaitForFences(fenceCount, ppPalFences, waitAll != VK_FALSE, timeout); + palResult = PalDevice(DefaultDeviceIndex)->WaitForFences(fenceCount, + ppPalFences, + waitAll != VK_FALSE, + Uint64ToChronoNano(timeout)); } else { @@ -2424,7 +2428,7 @@ VkResult Device::WaitForFences( palResult = PalDevice(deviceIdx)->WaitForFences(perDeviceFenceCount, ppPalFences, waitAll != VK_FALSE, - timeout); + Uint64ToChronoNano(timeout)); } } } @@ -3172,8 +3176,9 @@ VkResult Device::WaitSemaphores( { flags |= Pal::HostWaitFlags::HostWaitAny; } + palResult = PalDevice(DefaultDeviceIndex)->WaitForSemaphores(pWaitInfo->semaphoreCount, ppPalSemaphores, - pWaitInfo->pValues, flags, timeout); + pWaitInfo->pValues, flags, Uint64ToChronoNano(timeout)); return PalToVkResult(palResult); } @@ -3640,6 +3645,15 @@ VkResult Device::AllocBorderColorPalette() return result; } +// ================================================================================================================= +VkResult Device::CreateIndirectCommandsLayout( + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout) +{ + return IndirectCommandsLayout::Create(this, pCreateInfo, pAllocator, pIndirectCommandsLayout); +} + // ===================================================================================================================== void Device::DestroyBorderColorPalette() { @@ -5383,6 +5397,45 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout2KHR( &pSubresource->imageSubresource, &pLayout->subresourceLayout); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetGeneratedCommandsMemoryRequirementsNV( + VkDevice device, + const VkGeneratedCommandsMemoryRequirementsInfoNV* pInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + const Device* pDevice = ApiDevice::ObjectFromHandle(device); + const IndirectCommandsLayout* pLayout = IndirectCommandsLayout::ObjectFromHandle(pInfo->indirectCommandsLayout); + + pLayout->CalculateMemoryRequirements(pDevice, pMemoryRequirements); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIndirectCommandsLayoutNV( + VkDevice device, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout) +{ + Device* pDevice = ApiDevice::ObjectFromHandle(device); + const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + + return pDevice->CreateIndirectCommandsLayout(pCreateInfo, pAllocCB, pIndirectCommandsLayout); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkDestroyIndirectCommandsLayoutNV( + VkDevice device, + VkIndirectCommandsLayoutNV indirectCommandsLayout, + const VkAllocationCallbacks* pAllocator) +{ + if (indirectCommandsLayout != VK_NULL_HANDLE) + { + Device* pDevice = ApiDevice::ObjectFromHandle(device); + const VkAllocationCallbacks* pAllocCB = pAllocator ? 
pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + + IndirectCommandsLayout::ObjectFromHandle(indirectCommandsLayout)->Destroy(pDevice, pAllocCB); + } +} } // entry @@ -5393,3 +5446,6 @@ template VkPipelineCreateFlags2KHR vk::Device::GetPipelineCreateFlags( const VkRayTracingPipelineCreateInfoKHR* pCreateInfo); #endif +template +VkPipelineCreateFlags2KHR vk::Device::GetPipelineCreateFlags( + const VkComputePipelineCreateInfo* pCreateInfo); diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 9c8ae471..1f2a2468 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -791,6 +791,17 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetShaderModuleIdentifierEXT ); INIT_DISPATCH_ENTRY(vkGetShaderModuleCreateInfoIdentifierEXT ); + INIT_DISPATCH_ENTRY(vkCreateIndirectCommandsLayoutNV ); + INIT_DISPATCH_ENTRY(vkDestroyIndirectCommandsLayoutNV ); + INIT_DISPATCH_ENTRY(vkGetGeneratedCommandsMemoryRequirementsNV ); + INIT_DISPATCH_ENTRY(vkCmdPreprocessGeneratedCommandsNV ); + INIT_DISPATCH_ENTRY(vkCmdExecuteGeneratedCommandsNV ); + INIT_DISPATCH_ENTRY(vkCmdBindPipelineShaderGroupNV ); + + INIT_DISPATCH_ENTRY(vkGetPipelineIndirectDeviceAddressNV ); + INIT_DISPATCH_ENTRY(vkGetPipelineIndirectMemoryRequirementsNV ); + INIT_DISPATCH_ENTRY(vkCmdUpdatePipelineIndirectBufferNV ); + INIT_DISPATCH_ENTRY(vkCmdSetTessellationDomainOriginEXT ); INIT_DISPATCH_ENTRY(vkCmdSetDepthClampEnableEXT ); INIT_DISPATCH_ENTRY(vkCmdSetPolygonModeEXT ); diff --git a/icd/api/vk_formats.cpp b/icd/api/vk_formats.cpp index d593e09b..39cc12b1 100644 --- a/icd/api/vk_formats.cpp +++ b/icd/api/vk_formats.cpp @@ -30,6 +30,7 @@ */ #include "include/vk_formats.h" #include "include/vk_conv.h" +#include "include/vk_physical_device.h" namespace vk { #if ( VKI_GPU_DECOMPRESS) @@ -321,4 +322,344 @@ Pal::Formats::NumericSupportFlags Formats::GetNumberFormat( return numType; } +// ===================================================================================================================== +// Individual planes of multi-planar formats are size-compatible with single-plane color formats if they occupy +// the same number of bits per texel block, and are compatible with those formats if they have the same block extent. +// See 34.1.1 Compatible Formats of Planes of Multi-Planar Formats +VkFormat Formats::GetCompatibleSinglePlaneFormat(VkFormat multiPlaneFormat, uint32_t planeIndex) +{ + VK_ASSERT(GetYuvPlaneCounts(multiPlaneFormat) > 1); + VkFormat singlePlaneFormat = VK_FORMAT_UNDEFINED; + + if (planeIndex < GetYuvPlaneCounts(multiPlaneFormat)) + { + // The conversion below is based on the table in 34.1.1. + // Individual planes of a multi-planar format are in turn format compatible with the listed single plane + // format's Format Compatibility Classes (See 34.1.7).
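// A concrete reading of that table, using the NV12-style two-plane format as an example: for
// VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, plane 0 (luma) maps to VK_FORMAT_R8_UNORM and plane 1
// (interleaved chroma) maps to VK_FORMAT_R8G8_UNORM, i.e.
//
//   Formats::GetCompatibleSinglePlaneFormat(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 1)
//       == VK_FORMAT_R8G8_UNORM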
+ switch (multiPlaneFormat) + { + case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: + case VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM: + case VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM: + singlePlaneFormat = VK_FORMAT_R8_UNORM; + break; + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16: + singlePlaneFormat = VK_FORMAT_R10X6_UNORM_PACK16; + break; + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16: + singlePlaneFormat = VK_FORMAT_R12X4_UNORM_PACK16; + break; + case VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM: + case VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM: + case VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM: + singlePlaneFormat = VK_FORMAT_R16_UNORM; + break; + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + case VK_FORMAT_G8_B8R8_2PLANE_422_UNORM: + case VK_FORMAT_G8_B8R8_2PLANE_444_UNORM: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R8_UNORM : + VK_FORMAT_R8G8_UNORM; + break; + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R10X6_UNORM_PACK16 : + VK_FORMAT_R10X6G10X6_UNORM_2PACK16; + break; + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R12X4_UNORM_PACK16 : + VK_FORMAT_R12X4G12X4_UNORM_2PACK16; + break; + case VK_FORMAT_G16_B16R16_2PLANE_420_UNORM: + case VK_FORMAT_G16_B16R16_2PLANE_422_UNORM: + case VK_FORMAT_G16_B16R16_2PLANE_444_UNORM: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R16_UNORM : + VK_FORMAT_R16G16_UNORM; + break; + default: + break; + } + } + + return singlePlaneFormat; +} + +// ===================================================================================================================== +// Computes the extended feature set of a format when VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set +// NOTE: This function assumes the format that is passed in does not have +// Pal::Formats::PropertyFlags::BitCountInaccurate set +VkFormatFeatureFlags Formats::GetExtendedFeatureFlags( + const PhysicalDevice* pPhysicalDevice, + VkFormat format, + VkImageTiling tiling, + const RuntimeSettings& settings) +{ + VkFormatFeatureFlags extendedFeatures = 0; + Pal::SwizzledFormat palFormat = VkToPalFormat(format, settings); + + uint32 bitsPerPixel = Pal::Formats::BitsPerPixel(palFormat.format); + + // The following tables are from the Format Compatibility Classes section of the Vulkan specification. 
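// Here "same class" effectively means "same bits per texel block": an image created as
// VK_FORMAT_R8G8B8A8_UNORM with VK_IMAGE_CREATE_EXTENDED_USAGE_BIT may gain usages valid for any
// other 32-bit-class format such as VK_FORMAT_R32_UINT, which is why the loop at the end of this
// function ORs together the format features of every same-class format.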
+ static constexpr VkFormat Bpp8FormatClass[] = + { + VK_FORMAT_R4G4_UNORM_PACK8, + VK_FORMAT_R8_UNORM, + VK_FORMAT_R8_SNORM, + VK_FORMAT_R8_USCALED, + VK_FORMAT_R8_SSCALED, + VK_FORMAT_R8_UINT, + VK_FORMAT_R8_SINT, + VK_FORMAT_R8_SRGB + }; + + static constexpr VkFormat Bpp16FormatClass[] = + { + VK_FORMAT_R10X6_UNORM_PACK16, + VK_FORMAT_R12X4_UNORM_PACK16, + VK_FORMAT_A4R4G4B4_UNORM_PACK16, + VK_FORMAT_A4B4G4R4_UNORM_PACK16, + VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_B4G4R4A4_UNORM_PACK16, + VK_FORMAT_R5G6B5_UNORM_PACK16, + VK_FORMAT_B5G6R5_UNORM_PACK16, + VK_FORMAT_R5G5B5A1_UNORM_PACK16, + VK_FORMAT_B5G5R5A1_UNORM_PACK16, + VK_FORMAT_A1R5G5B5_UNORM_PACK16, + VK_FORMAT_R8G8_UNORM, + VK_FORMAT_R8G8_SNORM, + VK_FORMAT_R8G8_USCALED, + VK_FORMAT_R8G8_SSCALED, + VK_FORMAT_R8G8_UINT, + VK_FORMAT_R8G8_SINT, + VK_FORMAT_R8G8_SRGB, + VK_FORMAT_R16_UNORM, + VK_FORMAT_R16_SNORM, + VK_FORMAT_R16_USCALED, + VK_FORMAT_R16_SSCALED, + VK_FORMAT_R16_UINT, + VK_FORMAT_R16_SINT, + VK_FORMAT_R16_SFLOAT + }; + + static constexpr VkFormat Bpp24FormatClass[] = + { + VK_FORMAT_R8G8B8_UNORM, + VK_FORMAT_R8G8B8_SNORM, + VK_FORMAT_R8G8B8_USCALED, + VK_FORMAT_R8G8B8_SSCALED, + VK_FORMAT_R8G8B8_UINT, + VK_FORMAT_R8G8B8_SINT, + VK_FORMAT_R8G8B8_SRGB, + VK_FORMAT_B8G8R8_UNORM, + VK_FORMAT_B8G8R8_SNORM, + VK_FORMAT_B8G8R8_USCALED, + VK_FORMAT_B8G8R8_SSCALED, + VK_FORMAT_B8G8R8_UINT, + VK_FORMAT_B8G8R8_SINT, + VK_FORMAT_B8G8R8_SRGB + }; + + static constexpr VkFormat Bpp32FormatClass[] = + { + VK_FORMAT_R10X6G10X6_UNORM_2PACK16, + VK_FORMAT_R12X4G12X4_UNORM_2PACK16, + VK_FORMAT_R8G8B8A8_UNORM, + VK_FORMAT_R8G8B8A8_SNORM, + VK_FORMAT_R8G8B8A8_USCALED, + VK_FORMAT_R8G8B8A8_SSCALED, + VK_FORMAT_R8G8B8A8_UINT, + VK_FORMAT_R8G8B8A8_SINT, + VK_FORMAT_R8G8B8A8_SRGB, + VK_FORMAT_B8G8R8A8_UNORM, + VK_FORMAT_B8G8R8A8_SNORM, + VK_FORMAT_B8G8R8A8_USCALED, + VK_FORMAT_B8G8R8A8_SSCALED, + VK_FORMAT_B8G8R8A8_UINT, + VK_FORMAT_B8G8R8A8_SINT, + VK_FORMAT_B8G8R8A8_SRGB, + VK_FORMAT_A8B8G8R8_UNORM_PACK32, + VK_FORMAT_A8B8G8R8_SNORM_PACK32, + VK_FORMAT_A8B8G8R8_USCALED_PACK32, + VK_FORMAT_A8B8G8R8_SSCALED_PACK32, + VK_FORMAT_A8B8G8R8_UINT_PACK32, + VK_FORMAT_A8B8G8R8_SINT_PACK32, + VK_FORMAT_A8B8G8R8_SRGB_PACK32, + VK_FORMAT_A2R10G10B10_UNORM_PACK32, + VK_FORMAT_A2R10G10B10_SNORM_PACK32, + VK_FORMAT_A2R10G10B10_USCALED_PACK32, + VK_FORMAT_A2R10G10B10_SSCALED_PACK32, + VK_FORMAT_A2R10G10B10_UINT_PACK32, + VK_FORMAT_A2R10G10B10_SINT_PACK32, + VK_FORMAT_A2B10G10R10_UNORM_PACK32, + VK_FORMAT_A2B10G10R10_SNORM_PACK32, + VK_FORMAT_A2B10G10R10_USCALED_PACK32, + VK_FORMAT_A2B10G10R10_SSCALED_PACK32, + VK_FORMAT_A2B10G10R10_UINT_PACK32, + VK_FORMAT_A2B10G10R10_SINT_PACK32, + VK_FORMAT_R16G16_UNORM, + VK_FORMAT_R16G16_SNORM, + VK_FORMAT_R16G16_USCALED, + VK_FORMAT_R16G16_SSCALED, + VK_FORMAT_R16G16_UINT, + VK_FORMAT_R16G16_SINT, + VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R32_UINT, + VK_FORMAT_R32_SINT, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_B10G11R11_UFLOAT_PACK32, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 + }; + + static constexpr VkFormat Bpp48FormatClass[] = + { + VK_FORMAT_R16G16B16_UNORM, + VK_FORMAT_R16G16B16_SNORM, + VK_FORMAT_R16G16B16_USCALED, + VK_FORMAT_R16G16B16_SSCALED, + VK_FORMAT_R16G16B16_UINT, + VK_FORMAT_R16G16B16_SINT, + VK_FORMAT_R16G16B16_SFLOAT + }; + + static constexpr VkFormat Bpp64FormatClass[] = + { + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_USCALED, + VK_FORMAT_R16G16B16A16_SSCALED, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R16G16B16A16_SFLOAT, + 
VK_FORMAT_R32G32_UINT, + VK_FORMAT_R32G32_SINT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R64_UINT, + VK_FORMAT_R64_SINT, + VK_FORMAT_R64_SFLOAT + }; + + static constexpr VkFormat Bpp96FormatClass[] = + { + VK_FORMAT_R32G32B32_UINT, + VK_FORMAT_R32G32B32_SINT, + VK_FORMAT_R32G32B32_SFLOAT + }; + + static constexpr VkFormat Bpp128FormatClass[] = + { + VK_FORMAT_R32G32B32A32_UINT, + VK_FORMAT_R32G32B32A32_SINT, + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_FORMAT_R64G64_UINT, + VK_FORMAT_R64G64_SINT, + VK_FORMAT_R64G64_SFLOAT + }; + + static constexpr VkFormat Bpp192FormatClass[] = + { + VK_FORMAT_R64G64B64_UINT, + VK_FORMAT_R64G64B64_SINT, + VK_FORMAT_R64G64B64_SFLOAT + }; + + static constexpr VkFormat Bpp256FormatClass[] = + { + VK_FORMAT_R64G64B64A64_UINT, + VK_FORMAT_R64G64B64A64_SINT, + VK_FORMAT_R64G64B64A64_SFLOAT + }; + + // Depth images have no extended usage. + // YUV single and multiplanar images by themselves have no extended usage. To compute extended usage + // of a single plane of a multiplanar image, call GetCompatibleSinglePlaneFormat and pass that format in. + // BC images allow conversion between UNORM|SRGB but there shouldn't be any difference in features. + bool noCompatibleExtendedUsage = Formats::IsDepthStencilFormat(format) || + Formats::IsYuvFormat(format) || + Pal::Formats::IsBlockCompressed(palFormat.format) || + (format == VK_FORMAT_UNDEFINED); + + if (noCompatibleExtendedUsage == false) + { + const VkFormat* pExtendedFormats = nullptr; + uint32_t extendedFormatCount = 0; + + switch (bitsPerPixel) + { + case 8: + pExtendedFormats = Bpp8FormatClass; + extendedFormatCount = sizeof(Bpp8FormatClass) / sizeof(VkFormat); + break; + case 16: + pExtendedFormats = Bpp16FormatClass; + extendedFormatCount = sizeof(Bpp16FormatClass) / sizeof(VkFormat); + break; + case 24: + pExtendedFormats = Bpp24FormatClass; + extendedFormatCount = sizeof(Bpp24FormatClass) / sizeof(VkFormat); + break; + case 32: + pExtendedFormats = Bpp32FormatClass; + extendedFormatCount = sizeof(Bpp32FormatClass) / sizeof(VkFormat); + break; + case 48: + pExtendedFormats = Bpp48FormatClass; + extendedFormatCount = sizeof(Bpp48FormatClass) / sizeof(VkFormat); + break; + case 64: + pExtendedFormats = Bpp64FormatClass; + extendedFormatCount = sizeof(Bpp64FormatClass) / sizeof(VkFormat); + break; + case 96: + pExtendedFormats = Bpp96FormatClass; + extendedFormatCount = sizeof(Bpp96FormatClass) / sizeof(VkFormat); + break; + case 128: + pExtendedFormats = Bpp128FormatClass; + extendedFormatCount = sizeof(Bpp128FormatClass) / sizeof(VkFormat); + break; + case 192: + pExtendedFormats = Bpp192FormatClass; + extendedFormatCount = sizeof(Bpp192FormatClass) / sizeof(VkFormat); + break; + case 256: + pExtendedFormats = Bpp256FormatClass; + extendedFormatCount = sizeof(Bpp256FormatClass) / sizeof(VkFormat); + break; + default: + VK_ALERT_ALWAYS_MSG("Unknown Format Class"); + } + + if ((extendedFormatCount > 0) && (pExtendedFormats != nullptr)) + { + for (uint32_t i = 0; i < extendedFormatCount; ++i) + { + VkFormat extendedFormat = pExtendedFormats[i]; + + VkFormatProperties extendedFormatProperties = {}; + + VkResult result = pPhysicalDevice->GetFormatProperties(extendedFormat, &extendedFormatProperties); + if (result != VK_ERROR_FORMAT_NOT_SUPPORTED) + { + extendedFeatures |= (tiling == VK_IMAGE_TILING_OPTIMAL) ?
+ extendedFormatProperties.optimalTilingFeatures : + extendedFormatProperties.linearTilingFeatures; + } + + } + } + } + + return extendedFeatures; +} + } diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 08595c79..5fc1f618 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -86,8 +86,7 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( if (shouldCompile) { - bool skipCacheQuery = pDevice->GetRuntimeSettings().enablePipelineDump; - + bool skipCacheQuery = false; if (skipCacheQuery == false) { // Search the pipeline binary cache @@ -105,26 +104,71 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( } } - // Compile if unable to retrieve from cache if (shouldCompile) { - if ((deviceIdx == DefaultDeviceIndex) || (pCreateInfo == nullptr)) + if ((pDevice->GetRuntimeSettings().ignoreFlagFailOnPipelineCompileRequired == false) && + (flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)) { - if (pCreateInfo != nullptr) - { - result = pDefaultCompiler->ConvertGraphicsPipelineInfo( - pDevice, - pCreateInfo, - extStructs, - flags, - pShaderInfo, - pPipelineLayout, - pPipelineOptimizerKey, - pBinaryMetadata, - pBinaryCreateInfo); - } + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } + + bool shouldConvert = (pCreateInfo != nullptr) && + (pDevice->GetRuntimeSettings().enablePipelineDump || + (shouldCompile && (deviceIdx == DefaultDeviceIndex))); + + VkResult convertResult = VK_ERROR_UNKNOWN; + if (shouldConvert) + { + convertResult = pDefaultCompiler->ConvertGraphicsPipelineInfo( + pDevice, + pCreateInfo, + extStructs, + flags, + pShaderInfo, + pPipelineLayout, + pPipelineOptimizerKey, + pBinaryMetadata, + pBinaryCreateInfo); + result = (result == VK_SUCCESS) ? convertResult : result; + } + + if ((result == VK_SUCCESS) && (convertResult == VK_SUCCESS) && shouldCompile) + { + if (IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.vs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.gs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.tcs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.tes) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.fs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.task) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.mesh)) + { + result = VK_ERROR_UNKNOWN; + } + } + + if (pDevice->GetRuntimeSettings().enablePipelineDump && (convertResult == VK_SUCCESS)) + { + if ((shouldCompile == false) || (result != VK_SUCCESS)) + { + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pGraphicsInfo = &pBinaryCreateInfo->pipelineInfo; + pDefaultCompiler->DumpPipeline( + pDevice->GetRuntimeSettings(), + pipelineInfo, + pBinaryCreateInfo->apiPsoHash, + 1, + &pPipelineBinaries[deviceIdx], + result); + } + } - if (result == VK_SUCCESS) + // Compile if unable to retrieve from cache + if (shouldCompile) + { + if (result == VK_SUCCESS) + { + if ((deviceIdx == DefaultDeviceIndex) || (pCreateInfo == nullptr)) { result = pDefaultCompiler->CreateGraphicsPipelineBinary( pDevice, @@ -145,45 +189,45 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( pBinaryCreateInfo->pBinaryMetadata); } } - } - else - { - GraphicsPipelineBinaryCreateInfo binaryCreateInfoMGPU = {}; - PipelineMetadata binaryMetadataMGPU = {}; - result = pDefaultCompiler->ConvertGraphicsPipelineInfo( - pDevice, - pCreateInfo, - extStructs, - flags, - pShaderInfo, - pPipelineLayout, - pPipelineOptimizerKey, - &binaryMetadataMGPU, - 
&binaryCreateInfoMGPU); - - if (result == VK_SUCCESS) + else { - result = pDevice->GetCompiler(deviceIdx)->CreateGraphicsPipelineBinary( + GraphicsPipelineBinaryCreateInfo binaryCreateInfoMGPU = {}; + PipelineMetadata binaryMetadataMGPU = {}; + result = pDefaultCompiler->ConvertGraphicsPipelineInfo( pDevice, - deviceIdx, - pPipelineCache, - &binaryCreateInfoMGPU, + pCreateInfo, + extStructs, flags, - &pPipelineBinaries[deviceIdx], - &pCacheIds[deviceIdx]); - } + pShaderInfo, + pPipelineLayout, + pPipelineOptimizerKey, + &binaryMetadataMGPU, + &binaryCreateInfoMGPU); - if (result == VK_SUCCESS) - { - result = PipelineCompiler::SetPipelineCreationFeedbackInfo( - pCreationFeedbackInfo, - pCreateInfo->stageCount, - pCreateInfo->pStages, - &binaryCreateInfoMGPU.pipelineFeedback, - binaryCreateInfoMGPU.stageFeedback); - } + if (result == VK_SUCCESS) + { + result = pDevice->GetCompiler(deviceIdx)->CreateGraphicsPipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + &binaryCreateInfoMGPU, + flags, + &pPipelineBinaries[deviceIdx], + &pCacheIds[deviceIdx]); + } + + if (result == VK_SUCCESS) + { + result = PipelineCompiler::SetPipelineCreationFeedbackInfo( + pCreationFeedbackInfo, + pCreateInfo->stageCount, + pCreateInfo->pStages, + &binaryCreateInfoMGPU.pipelineFeedback, + binaryCreateInfoMGPU.stageFeedback); + } - pDefaultCompiler->FreeGraphicsPipelineCreateInfo(pDevice, &binaryCreateInfoMGPU, false, false); + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(pDevice, &binaryCreateInfoMGPU, false, false); + } } } else if (deviceIdx == DefaultDeviceIndex) @@ -545,16 +589,17 @@ static bool IsGplFastLinkPossible( // ===================================================================================================================== void DumpGplFastLinkInfo( - const Device* pDevice, - VkPipeline pipeline, - GraphicsPipelineBinaryCreateInfo* pCreateInfo) + const Device* pDevice, + VkPipeline pipeline, + const GraphicsPipelineBinaryCreateInfo& createInfo, + const GraphicsPipelineLibraryInfo& libInfo) { const GraphicsPipeline* pGraphicsPipeline = GraphicsPipeline::ObjectFromHandle(pipeline); const Pal::IPipeline* pPalPipeline = pGraphicsPipeline->GetPalPipeline(DefaultDeviceIndex); const Pal::PipelineInfo info = pPalPipeline->GetInfo(); const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : info.internalPipelineHash.stable; + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? 
createInfo.apiPsoHash : info.internalPipelineHash.stable; Vkgc::PipelineDumpOptions dumpOptions = {}; dumpOptions.pDumpDir = settings.pipelineDumpDir; @@ -563,49 +608,66 @@ void DumpGplFastLinkInfo( dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; Vkgc::PipelineBuildInfo pipelineInfo = {}; - pCreateInfo->pipelineInfo.unlinked = false; - pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + pipelineInfo.pGraphicsInfo = &createInfo.pipelineInfo; void* pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); if (pPipelineDumpHandle != nullptr) { - char extraInfo[256] = {}; + char preRasterFileName[Util::MaxFileNameStrLen] = {}; + char fragmentFileName[Util::MaxFileNameStrLen] = {}; + char colorExportFileName[Util::MaxFileNameStrLen] = {}; + + const GraphicsPipelineBinaryCreateInfo& preRasterCreateInfo = + libInfo.pPreRasterizationShaderLib->GetPipelineBinaryCreateInfo(); + const GraphicsPipelineBinaryCreateInfo& fragmentCreateInfo = + libInfo.pFragmentShaderLib->GetPipelineBinaryCreateInfo(); + + uint64_t preRasterHash = settings.dumpPipelineWithApiHash ? + preRasterCreateInfo.apiPsoHash : preRasterCreateInfo.libraryHash[GraphicsLibraryPreRaster]; + uint64_t fragmentHash = settings.dumpPipelineWithApiHash ? + fragmentCreateInfo.apiPsoHash : fragmentCreateInfo.libraryHash[GraphicsLibraryFragment]; + + Vkgc::IPipelineDumper::GetPipelineName(&preRasterCreateInfo.pipelineInfo, + preRasterFileName, Util::MaxFileNameStrLen, preRasterHash); + Vkgc::IPipelineDumper::GetPipelineName(&fragmentCreateInfo.pipelineInfo, + fragmentFileName, Util::MaxFileNameStrLen, fragmentHash); + + if (createInfo.pipelineInfo.enableColorExportShader) + { + uint64_t colorExportHash = settings.dumpPipelineWithApiHash ? 
+ createInfo.apiPsoHash : createInfo.libraryHash[GraphicsLibraryColorExport]; + Vkgc::GraphicsPipelineBuildInfo colorExportInfo = {}; + colorExportInfo.unlinked = true; + Vkgc::IPipelineDumper::GetPipelineName(&colorExportInfo, + colorExportFileName, Util::MaxFileNameStrLen, colorExportHash); + } + + const char* fileNames[] = {preRasterFileName, fragmentFileName, colorExportFileName}; + Vkgc::IPipelineDumper::DumpGraphicsLibraryFileName(pPipelineDumpHandle, fileNames); + + char extraInfo[256] = {}; Util::Snprintf( extraInfo, sizeof(extraInfo), - "; ApiPsoHash: 0x%016" PRIX64 "\n", - pCreateInfo->apiPsoHash); + "\n; ApiPsoHash: 0x%016" PRIX64 "\n", + createInfo.apiPsoHash); Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, extraInfo); - for (uint32_t i = 0; i < GraphicsLibraryCount; i++) - { - if (pCreateInfo->pShaderLibraries[i] == nullptr) - { - continue; - } - const Pal::LibraryInfo& libInfo = pCreateInfo->pShaderLibraries[i]->GetInfo(); - Util::Snprintf( - extraInfo, - sizeof(extraInfo), - "; GraphicsPipelineLibrary Hash: 0x%016" PRIX64 "\n", - libInfo.internalLibraryHash.stable); - Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, extraInfo); - } for (uint32_t i = 0; i < GraphicsLibraryCount; i++) { - if (pCreateInfo->pShaderLibraries[i] == nullptr) + if (createInfo.pShaderLibraries[i] == nullptr) { continue; } uint32_t codeSize = 0; - Pal::Result result = pCreateInfo->pShaderLibraries[i]->GetCodeObject(&codeSize, nullptr); + Pal::Result result = createInfo.pShaderLibraries[i]->GetCodeObject(&codeSize, nullptr); if ((codeSize > 0) && (result == Pal::Result::Success)) { void* pCode = pDevice->VkInstance()->AllocMem(codeSize, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pCode != nullptr) { - result = pCreateInfo->pShaderLibraries[i]->GetCodeObject(&codeSize, pCode); + result = createInfo.pShaderLibraries[i]->GetCodeObject(&codeSize, pCode); VK_ASSERT(result == Pal::Result::Success); Vkgc::BinaryData libraryBinary = {}; @@ -619,8 +681,9 @@ void DumpGplFastLinkInfo( } } - PipelineCompiler::DumpPipelineMetadata(pPipelineDumpHandle, pCreateInfo->pBinaryMetadata); + PipelineCompiler::DumpPipelineMetadata(pPipelineDumpHandle, createInfo.pBinaryMetadata); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, "\n;CompileResult=FastLinkSuccess\n"); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } } @@ -650,7 +713,7 @@ VkResult GraphicsPipeline::Create( GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; GraphicsPipelineShaderStageInfo shaderStageInfo = {}; PipelineOptimizerKey pipelineOptimizerKey = {}; - uint64_t apiPsoHash = {}; + uint64_t apiPsoHash = 0; Util::MetroHash::Hash elfHash = {}; PipelineMetadata binaryMetadata = {}; PipelineLayout* pPipelineLayout = nullptr; @@ -671,7 +734,6 @@ VkResult GraphicsPipeline::Create( // 1. Check whether GPL fast link is possible if (pDevice->GetRuntimeSettings().useShaderLibraryForPipelineLibraryFastLink) { - // If pipeline only contains PreRasterizationShaderLib and no fragment shader is in the create info, // we add a null fragment library in order to use fast link. 
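// [Editor's illustration, not part of the patch] The app-side request that reaches this fast-link path is a
// link-only pipeline: libraries chained through VkPipelineLibraryCreateInfoKHR with no
// VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT set, for example (handles here are hypothetical):
//
//   VkPipelineLibraryCreateInfoKHR libraryInfo = {};
//   libraryInfo.sType        = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR;
//   libraryInfo.libraryCount = 2;             // pre-rasterization + fragment shader libraries
//   libraryInfo.pLibraries   = libraries;
//
//   VkGraphicsPipelineCreateInfo createInfo = {};
//   createInfo.sType  = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;
//   createInfo.pNext  = &libraryInfo;         // omitting the link-time-optimization bit keeps fast link possible
//   createInfo.layout = layout;
//
//   vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &createInfo, nullptr, &pipeline);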
if ((libInfo.flags.isLibrary == false) && @@ -763,7 +825,6 @@ VkResult GraphicsPipeline::Create( &shaderOptimizerKeys[0], &pipelineOptimizerKey, &apiPsoHash, - //&elfHash, &tempModules[0], &cacheId[0]); @@ -921,7 +982,7 @@ VkResult GraphicsPipeline::Create( if (enableFastLink && pDevice->GetRuntimeSettings().enablePipelineDump) { - DumpGplFastLinkInfo(pDevice, *pPipeline, &binaryCreateInfo); + DumpGplFastLinkInfo(pDevice, *pPipeline, binaryCreateInfo, libInfo); } } @@ -1599,7 +1660,7 @@ VkResult GraphicsPipeline::Destroy( { if (m_deferWorkload.pEvent != nullptr) { - auto result = m_deferWorkload.pEvent->Wait(10); + auto result = m_deferWorkload.pEvent->Wait(Util::fseconds{ 10 }); if (result == Util::Result::Success) { Util::Destructor(m_deferWorkload.pEvent); diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index 55bb7568..8ce4eb75 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -352,17 +352,26 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( continue; } + if ((GetVkGraphicsLibraryFlagBit(pShaderStageInfo->stages[i].stage) ^ pLibInfo->libFlags) != 0) + { + continue; + } + if (canBuildShader) { - // We don't take care of the result. Early compile failure in some cases is expected - pCompiler->CreateGraphicsShaderBinary( + result = pCompiler->CreateGraphicsShaderBinary( pDevice, pPipelineCache, gplType, pBinaryCreateInfo, &pTempModuleStages[i]); gplMask |= (1 << gplType); } + + if (result != VK_SUCCESS) + { + break; + } } } - if (pLibInfo->flags.optimize) + if ((result == VK_SUCCESS) && pLibInfo->flags.optimize) { // We need to re-compile some stage if related new state is available if ((pLibInfo->libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) && @@ -405,17 +414,20 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( palElfBinary = pCompiler->GetSolution(pBinaryCreateInfo->compilerType)-> ExtractPalElfBinary(pBinaryCreateInfo->earlyElfPackage[gplType]); - result = pCompiler->CreateGraphicsShaderLibrary(pDevice, - palElfBinary, - pAllocator, - &pBinaryCreateInfo->pShaderLibraries[gplType]); - pBinaryCreateInfo->earlyElfPackage[gplType].pCode = nullptr; - } - - if (pTempModuleStages[stage].elfPackage.codeSize > 0) - { - pDevice->VkInstance()->FreeMem(const_cast(pTempModuleStages[stage].elfPackage.pCode)); - pTempModuleStages[stage].elfPackage = {}; + if (palElfBinary.codeSize > 0) + { + result = pCompiler->CreateGraphicsShaderLibrary(pDevice, + palElfBinary, + pAllocator, + &pBinaryCreateInfo->pShaderLibraries[gplType]); + pBinaryCreateInfo->earlyElfPackage[gplType].pCode = nullptr; + + if (pTempModuleStages[stage].elfPackage.codeSize > 0) + { + pDevice->VkInstance()->FreeMem(const_cast(pTempModuleStages[stage].elfPackage.pCode)); + pTempModuleStages[stage].elfPackage = {}; + } + } } } @@ -693,11 +705,11 @@ GraphicsPipelineLibrary::GraphicsPipelineLibrary( const uint64_t apiHash, const GplModuleState* pGplModuleStates, const PipelineLayout* pPipelineLayout) + : GraphicsPipelineCommon( #if VKI_RAY_TRACING - : GraphicsPipelineCommon(false, pDevice), -#else - : GraphicsPipelineCommon(pDevice), + false, #endif + pDevice), m_objectCreateInfo(objectInfo), m_pBinaryCreateInfo(pBinaryInfo), m_libInfo(libInfo), diff --git a/icd/api/vk_indirect_commands_layout.cpp b/icd/api/vk_indirect_commands_layout.cpp new file mode 100644 index 00000000..0a84dd44 --- /dev/null +++ b/icd/api/vk_indirect_commands_layout.cpp @@ -0,0 +1,382 @@ +/* + 
*********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file vk_indirect_commands_layout.cpp + * @brief Contains implementation of Vulkan indirect commands layout objects. + *********************************************************************************************************************** + */ + +#include "include/vk_indirect_commands_layout.h" +#include "include/vk_buffer.h" +#include "include/vk_conv.h" + +namespace vk +{ +// ===================================================================================================================== +// Creates an indirect commands layout object. 
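// [Editor's illustration, not part of the patch] A minimal client-side layout that Create() below accepts: a
// single stream whose sequence binds a vertex buffer and then draws, so the one action token comes last (matching
// the assertions and the switch on the final token below). The stride layout is an assumption for illustration.
VkIndirectCommandsLayoutNV CreateExampleDrawLayout(
    VkDevice device)
{
    VkIndirectCommandsLayoutTokenNV tokens[2] = {};

    tokens[0].sType             = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV;
    tokens[0].tokenType         = VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV;
    tokens[0].stream            = 0;
    tokens[0].offset            = 0;
    tokens[0].vertexBindingUnit = 0;

    tokens[1].sType     = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV;
    tokens[1].tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV;   // the action token must be last
    tokens[1].stream    = 0;
    tokens[1].offset    = sizeof(VkBindVertexBufferIndirectCommandNV);

    const uint32_t stride = sizeof(VkBindVertexBufferIndirectCommandNV) + sizeof(VkDrawIndirectCommand);

    VkIndirectCommandsLayoutCreateInfoNV createInfo = {};
    createInfo.sType             = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV;
    createInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
    createInfo.tokenCount        = 2;
    createInfo.pTokens           = tokens;
    createInfo.streamCount       = 1;    // this implementation asserts streamCount == 1
    createInfo.pStreamStrides    = &stride;

    VkIndirectCommandsLayoutNV layout = VK_NULL_HANDLE;
    vkCreateIndirectCommandsLayoutNV(device, &createInfo, nullptr, &layout);

    return layout;
}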
+VkResult IndirectCommandsLayout::Create( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pLayout) +{ + VkResult result = VK_SUCCESS; + Pal::Result palResult; + + Pal::IndirectCmdGeneratorCreateInfo createInfo = {}; + Pal::IndirectParam indirectParams[MaxIndirectTokenCount] = {}; + createInfo.pParams = &indirectParams[0]; + + Pal::IIndirectCmdGenerator* pGenerators[MaxPalDevices] = {}; + + const size_t apiSize = ObjectSize(pDevice); + size_t totalSize = apiSize; + size_t palSize = 0; + + void* pMemory = nullptr; + + IndirectCommandsInfo info = {}; + + VK_ASSERT(pCreateInfo->streamCount == 1); + VK_ASSERT(pCreateInfo->tokenCount > 0); + VK_ASSERT(pCreateInfo->tokenCount <= MaxIndirectTokenCount); + + if (pCreateInfo->tokenCount == 1) + { + VK_NOT_IMPLEMENTED; + } + + const VkIndirectCommandsLayoutTokenNV lastToken = pCreateInfo->pTokens[pCreateInfo->tokenCount - 1]; + + switch (lastToken.tokenType) + { + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV: + info.actionType = IndirectCommandsActionType::Draw; + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV: + info.actionType = IndirectCommandsActionType::DrawIndexed; + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV: + info.actionType = IndirectCommandsActionType::Dispatch; + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: + info.actionType = IndirectCommandsActionType::MeshTask; + break; + + default: + VK_ALERT_ALWAYS_MSG("Indirect tokens can only end up with one type of actions."); + result = VK_ERROR_UNKNOWN; + break; + } + + if (result == VK_SUCCESS) + { + BuildPalCreateInfo(pDevice, pCreateInfo, &indirectParams[0], &createInfo); + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + const size_t size = pDevice->PalDevice(deviceIdx)->GetIndirectCmdGeneratorSize(createInfo, + &palResult); + if (palResult == Pal::Result::Success) + { + palSize += size; + } + else + { + result = PalToVkResult(palResult); + break; + } + } + + totalSize += palSize; + } + + if (result == VK_SUCCESS) + { + pMemory = pDevice->AllocApiObject(pAllocator, totalSize); + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + + if (result == VK_SUCCESS) + { + void* pPalMemory = Util::VoidPtrInc(pMemory, apiSize); + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + const size_t size = pDevice->PalDevice(deviceIdx)->GetIndirectCmdGeneratorSize(createInfo, + &palResult); + + if (palResult == Pal::Result::Success) + { + palResult = pDevice->PalDevice(deviceIdx)->CreateIndirectCmdGenerator(createInfo, + pPalMemory, + &pGenerators[deviceIdx]); + } + + if (palResult == Pal::Result::Success) + { + pPalMemory = Util::VoidPtrInc(pPalMemory, size); + } + else + { + result = PalToVkResult(palResult); + break; + } + } + } + + if (result == VK_SUCCESS) + { + VK_PLACEMENT_NEW(pMemory) IndirectCommandsLayout( + pDevice, + info, + pGenerators, + createInfo); + + *pLayout = IndirectCommandsLayout::HandleFromVoidPointer(pMemory); + } + + return result; +} + +// ===================================================================================================================== +IndirectCommandsLayout::IndirectCommandsLayout( + const Device* pDevice, + const IndirectCommandsInfo& info, + Pal::IIndirectCmdGenerator** pPalGenerator, + const Pal::IndirectCmdGeneratorCreateInfo& palCreateInfo) + : + m_info(info), + m_palCreateInfo(palCreateInfo) 
+{ + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + m_perGpu[deviceIdx].pGenerator = pPalGenerator[deviceIdx]; + m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + } +} + +// ===================================================================================================================== +void IndirectCommandsLayout::BuildPalCreateInfo( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + Pal::IndirectParam* pIndirectParams, + Pal::IndirectCmdGeneratorCreateInfo* pPalCreateInfo) +{ + uint32_t bindingArgsSize = 0; + + const bool isDispatch = (pCreateInfo->pTokens[pCreateInfo->tokenCount - 1].tokenType + == VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV); + + for (uint32_t i = 0; i < pCreateInfo->tokenCount; ++i) + { + const VkIndirectCommandsLayoutTokenNV& token = pCreateInfo->pTokens[i]; + + switch (token.tokenType) + { + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV: + pIndirectParams[i].type = Pal::IndirectParamType::Draw; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DrawIndirectArgs); + static_assert(sizeof(Pal::DrawIndirectArgs) == sizeof(VkDrawIndirectCommand)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV: + pIndirectParams[i].type = Pal::IndirectParamType::DrawIndexed; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DrawIndexedIndirectArgs); + static_assert(sizeof(Pal::DrawIndexedIndirectArgs) == sizeof(VkDrawIndexedIndirectCommand)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV: + pIndirectParams[i].type = Pal::IndirectParamType::Dispatch; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DispatchIndirectArgs); + static_assert(sizeof(Pal::DispatchIndirectArgs) == sizeof(VkDispatchIndirectCommand)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV: + pIndirectParams[i].type = Pal::IndirectParamType::BindIndexData; + pIndirectParams[i].sizeInBytes = sizeof(Pal::BindIndexDataIndirectArgs); + static_assert(sizeof(Pal::BindIndexDataIndirectArgs) == sizeof(VkBindIndexBufferIndirectCommandNV)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV: + pIndirectParams[i].type = Pal::IndirectParamType::BindVertexData; + pIndirectParams[i].sizeInBytes = sizeof(Pal::BindVertexDataIndirectArgs); + pIndirectParams[i].vertexData.bufferId = token.vertexBindingUnit; + pIndirectParams[i].userDataShaderUsage = Pal::ApiShaderStageVertex; + static_assert(sizeof(Pal::BindVertexDataIndirectArgs) == sizeof(VkBindVertexBufferIndirectCommandNV)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: + pIndirectParams[i].type = Pal::IndirectParamType::DispatchMesh; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DispatchMeshIndirectArgs); + static_assert(sizeof(Pal::DispatchMeshIndirectArgs) == sizeof(VkDrawMeshTasksIndirectCommandEXT)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV: + { + const PipelineLayout* pPipelineLayout = PipelineLayout::ObjectFromHandle(token.pushconstantPipelineLayout); + const UserDataLayout& userDataLayout = pPipelineLayout->GetInfo().userDataLayout; + + if (userDataLayout.scheme == PipelineLayoutScheme::Indirect) + { + VK_NOT_IMPLEMENTED; + } + + uint32_t startInDwords = token.pushconstantOffset / sizeof(uint32_t); + uint32_t lengthInDwords = PipelineLayout::GetPushConstantSizeInDword(token.pushconstantSize); + + pIndirectParams[i].type = Pal::IndirectParamType::SetUserData; + pIndirectParams[i].userData.entryCount = lengthInDwords; + pIndirectParams[i].sizeInBytes = sizeof(uint32_t) * lengthInDwords; + 
pIndirectParams[i].userData.firstEntry = userDataLayout.compact.pushConstRegBase + startInDwords; + pIndirectParams[i].userDataShaderUsage = VkToPalShaderStageMask(token.pushconstantShaderStageFlags); + break; + } + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SHADER_GROUP_NV: + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV: + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_TASKS_NV: + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV: + VK_NOT_IMPLEMENTED; + break; + + default: + VK_NEVER_CALLED(); + break; + + } + + if (i < (pCreateInfo->tokenCount - 1)) + { + bindingArgsSize += pIndirectParams[i].sizeInBytes; + } + } + + for (uint32_t i = 0; i < pCreateInfo->streamCount; ++i) + { + uint32_t stride = pCreateInfo->pStreamStrides[i]; + pPalCreateInfo->strideInBytes += stride; + } + + pPalCreateInfo->paramCount = pCreateInfo->tokenCount; + + // Override userDataShaderUsage to compute shader only for dispatch type + if (isDispatch) + { + for (uint32_t i = 0; i < pPalCreateInfo->paramCount; ++i) + { + pIndirectParams[i].userDataShaderUsage = Pal::ShaderStageFlagBits::ApiShaderStageCompute; + } + } +} + +// ===================================================================================================================== +void IndirectCommandsLayout::CalculateMemoryRequirements( + const Device* pDevice, + VkMemoryRequirements2* pMemoryRequirements + ) const +{ + VK_ASSERT(m_perGpu[DefaultDeviceIndex].pGenerator != nullptr); + Pal::GpuMemoryRequirements memReqs = {}; + m_perGpu[DefaultDeviceIndex].pGenerator->GetGpuMemoryRequirements(&memReqs); + +#if DEBUG + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + VK_ASSERT(m_perGpu[deviceIdx].pGenerator != nullptr); + + if (deviceIdx != DefaultDeviceIndex) + { + Pal::GpuMemoryRequirements deviceReqs = {}; + m_perGpu[deviceIdx].pGenerator->GetGpuMemoryRequirements(&deviceReqs); + VK_ASSERT(memcmp(&memReqs, &deviceReqs, sizeof(deviceReqs)) == 0); + } + } +#endif + + pMemoryRequirements->memoryRequirements.alignment = memReqs.alignment; + pMemoryRequirements->memoryRequirements.size = memReqs.size; + + pMemoryRequirements->memoryRequirements.memoryTypeBits = 0; + + for (uint32_t i = 0; i < memReqs.heapCount; ++i) + { + uint32_t typeIndexBits; + + if (pDevice->GetVkTypeIndexBitsFromPalHeap(memReqs.heaps[i], &typeIndexBits)) + { + pMemoryRequirements->memoryRequirements.memoryTypeBits |= typeIndexBits; + } + } +} + +// ===================================================================================================================== +void IndirectCommandsLayout::BindPreprocessBuffer( + VkBuffer buffer, + VkDeviceSize memOffset, + uint32_t deviceIdx) +{ + Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); + Pal::gpusize bufferVirtAddr = pBuffer->PalMemory(deviceIdx)->Desc().gpuVirtAddr + memOffset; + + if (m_perGpu[deviceIdx].preprocessBufferVirtAddr != bufferVirtAddr) + { + Pal::Result palResult = m_perGpu[deviceIdx].pGenerator->BindGpuMemory(pBuffer->PalMemory(deviceIdx), + memOffset); + VK_ASSERT(palResult == Pal::Result::Success); + m_perGpu[deviceIdx].preprocessBufferVirtAddr = bufferVirtAddr; + } +} + +// ===================================================================================================================== +VkResult IndirectCommandsLayout::Destroy( + Device* pDevice, + const VkAllocationCallbacks* pAllocator) +{ + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + if (m_perGpu[deviceIdx].pGenerator != nullptr) + { + m_perGpu[deviceIdx].pGenerator->Destroy(); + } + // 
It's the app's responsibility to free the preprocess buffer.
+ m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0;
+ }
+
+ Util::Destructor(this);
+
+ pDevice->FreeApiObject(pAllocator, this);
+
+ return VK_SUCCESS;
+}
+
+} // namespace vk
diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp
index cca1b3cc..98b14635 100644
--- a/icd/api/vk_instance.cpp
+++ b/icd/api/vk_instance.cpp
@@ -46,6 +46,8 @@
 #if ICD_GPUOPEN_DEVMODE_BUILD
 #include "devmode/devmode_mgr.h"
+#include "devmode/devmode_rgp.h"
+#include "devmode/devmode_ubertrace.h"
 #endif
 #include "res/ver.h"
@@ -90,7 +92,7 @@ Instance::Instance(
 m_preInitAppProfile(preInitProfile),
 m_screenCount(0),
 m_pScreenStorage(nullptr),
- m_pDevModeMgr(nullptr),
+ m_pDevMode(nullptr),
 m_debugReportCallbacks(&m_palAllocator),
 m_debugUtilsMessengers(&m_palAllocator),
 m_logTagIdMask(0),
@@ -439,7 +441,7 @@ VkResult Instance::Init(
 // Late-initialize the developer mode manager. Needs to be called after settings are committed but BEFORE
 // physical devices are late-initialized (below).
- if ((status == VK_SUCCESS) && (m_pDevModeMgr != nullptr))
+ if ((status == VK_SUCCESS) && (m_pDevMode != nullptr))
 {
 DevModeLateInitialize();
 }
@@ -549,12 +551,10 @@ VkResult Instance::Init(
 InitDispatchTable();
-#if DEBUG
 // Optionally wait for a debugger to be attached
 utils::WaitIdleForDebugger(pPhysicalDevice->GetRuntimeSettings().waitForDebugger,
 &pPhysicalDevice->GetRuntimeSettings().waitForDebuggerExecutableName[0],
 pPhysicalDevice->GetRuntimeSettings().debugTimeout);
-#endif
 }
 return status;
@@ -630,9 +630,9 @@ VkResult Instance::LoadAndCommitSettings(
 #if ICD_GPUOPEN_DEVMODE_BUILD
 // Inform developer mode manager of settings. This also finalizes the developer mode manager.
- if (m_pDevModeMgr != nullptr)
+ if (m_pDevMode != nullptr)
 {
- m_pDevModeMgr->Finalize(deviceCount, settingsLoaders);
+ m_pDevMode->Finalize(deviceCount, settingsLoaders);
 }
 #endif
@@ -671,8 +671,8 @@ VkResult Instance::Destroy(void)
 AmdvlkLog(m_logTagIdMask, GeneralPrint, "%s End ********\n", GetApplicationName());
 #if ICD_GPUOPEN_DEVMODE_BUILD
- // Pipeline binary cache is required to be freed before destroying DevModeMgr
- // because DevModeMgr manages the state of pipeline binary cache.
+ // Pipeline binary cache is required to be freed before destroying DevMode
+ // because DevMode manages the state of pipeline binary cache. 
uint32_t deviceCount = PhysicalDeviceManager::MaxPhysicalDevices; VkPhysicalDevice devices[PhysicalDeviceManager::MaxPhysicalDevices] = {}; m_pPhysicalDeviceManager->EnumeratePhysicalDevices(&deviceCount, devices); @@ -681,9 +681,9 @@ VkResult Instance::Destroy(void) ApiPhysicalDevice::ObjectFromHandle(devices[deviceIdx])->GetCompiler()->DestroyPipelineBinaryCache(); } - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_pDevModeMgr->Destroy(); + m_pDevMode->Destroy(); } #endif @@ -1049,12 +1049,24 @@ void Instance::DevModeEarlyInitialize() { #if ICD_GPUOPEN_DEVMODE_BUILD VK_ASSERT(m_pPhysicalDeviceManager == nullptr); - VK_ASSERT(m_pDevModeMgr == nullptr); + VK_ASSERT(m_pDevMode == nullptr); // Initialize the devmode manager which abstracts interaction with the gpuopen dev driver component if (m_pPalPlatform->GetDevDriverServer() != nullptr) { - const VkResult result = DevModeMgr::Create(this, &m_pDevModeMgr); + VkResult result; + +#if PAL_BUILD_RDF + if ((m_pPalPlatform->GetTraceSession() != nullptr) && + (m_pPalPlatform->GetTraceSession()->IsTracingEnabled())) + { + result = DevModeUberTrace::Create(this, reinterpret_cast(&m_pDevMode)); + } + else +#endif + { + result = DevModeRgp::Create(this, reinterpret_cast(&m_pDevMode)); + } VK_ASSERT(result == VK_SUCCESS); } @@ -1068,16 +1080,16 @@ void Instance::DevModeLateInitialize() { #if ICD_GPUOPEN_DEVMODE_BUILD VK_ASSERT(m_pPhysicalDeviceManager != nullptr); - VK_ASSERT(m_pDevModeMgr != nullptr); + VK_ASSERT(m_pDevMode != nullptr); // Query if we need support for SQTT tracing, and notify the instance so that the correct dispatch table // layer can be installed. - if (m_pDevModeMgr->IsTracingEnabled()) + if (m_pDevMode->IsTracingEnabled()) { EnableTracingSupport(); } - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { EnableCrashAnalysisSupport(); } diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 9f738431..75fdd9cf 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -42,6 +42,7 @@ #include "include/vk_utils.h" #include "include/vk_conv.h" #include "include/vk_surface.h" +#include "include/vk_indirect_commands_layout.h" #include "include/khronos/vk_icd.h" @@ -2079,6 +2080,27 @@ VkResult PhysicalDevice::GetImageFormatProperties( : formatProperties.linearTilingFeatures; } + // If extended usage was set, compute the extended feature set based on the type + // of image and the format compatibility classes. + if (Util::TestAnyFlagSet(flags, VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)) + { + if (Formats::IsYuvFormat(format) && (Formats::GetYuvPlaneCounts(format) > 1)) + { + VkFormat singlePlaneFormat = VK_FORMAT_UNDEFINED; + + for (uint32_t i = 0; i < Formats::GetYuvPlaneCounts(format); ++i) + { + singlePlaneFormat = Formats::GetCompatibleSinglePlaneFormat(format, i); + + supportedFeatures |= Formats::GetExtendedFeatureFlags(this, singlePlaneFormat, tiling, settings); + } + } + else + { + supportedFeatures |= Formats::GetExtendedFeatureFlags(this, format, tiling, settings); + } + } + // 3D textures with depth or stencil format are not supported if ((type == VK_IMAGE_TYPE_3D) && (Formats::HasDepth(format) || Formats::HasStencil(format))) { @@ -2116,14 +2138,7 @@ VkResult PhysicalDevice::GetImageFormatProperties( (((usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) != 0) && ((supportedFeatures & VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) == 0))) { - // If extended usage was set ignore the error. 
We do not know what format or usage is intended. - // However for Yuv and Depth images that do not have any compatible formats, report error always. - if (((flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) == 0 )|| - Formats::IsYuvFormat(format) || - Formats::IsDepthStencilFormat(format)) - { - return VK_ERROR_FORMAT_NOT_SUPPORTED; - } + return VK_ERROR_FORMAT_NOT_SUPPORTED; } // Calculate maxResourceSize @@ -2582,11 +2597,11 @@ VkResult PhysicalDevice::GetPhysicalDeviceToolPropertiesEXT( bool isProfilingEnabled = false; VkResult result = VK_SUCCESS; - DevModeMgr* devModeMgr = VkInstance()->GetDevModeMgr(); + IDevMode* devMode = VkInstance()->GetDevModeMgr(); - if (devModeMgr != nullptr) + if (devMode != nullptr) { - isProfilingEnabled = devModeMgr->IsTracingEnabled(); + isProfilingEnabled = devMode->IsTracingEnabled(); } if (pToolProperties == nullptr) @@ -4382,6 +4397,12 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(NV_COMPUTE_SHADER_DERIVATIVES)); } + if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->GetRuntimeSettings().exportNvDeviceGeneratedCommands)) + { + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(NV_DEVICE_GENERATED_COMMANDS)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(NV_DEVICE_GENERATED_COMMANDS_COMPUTE)); + } + if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->GetRuntimeSettings().enableGraphicsPipelineLibraries)) { availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_GRAPHICS_PIPELINE_LIBRARY)); @@ -4415,6 +4436,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_NON_SEAMLESS_CUBE_MAP)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_MODULE_IDENTIFIER)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_MAXIMAL_RECONVERGENCE)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_EXTENDED_DYNAMIC_STATE3)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_VERTEX_INPUT_DYNAMIC_STATE)); @@ -4423,9 +4446,13 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_SUBGROUP_ROTATE)); - availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_FLOAT_CONTROLS2)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_FLOAT_CONTROLS2)); + + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_QUAD_CONTROL)); + + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_NESTED_COMMAND_BUFFER)); - availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_QUAD_CONTROL)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_DYNAMIC_RENDERING_LOCAL_READ)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_VERTEX_ATTRIBUTE_DIVISOR)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_INDEX_TYPE_UINT8)); @@ -5086,19 +5113,20 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct8Properties( *pIntegerDotProduct8BitUnsignedAccelerated = int8DotSupport; *pIntegerDotProduct8BitSignedAccelerated = int8DotSupport; - *pIntegerDotProductAccumulatingSaturating8BitUnsignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating8BitSignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating8BitUnsignedAccelerated = int8DotSupport; + *pIntegerDotProductAccumulatingSaturating8BitSignedAccelerated = int8DotSupport; #if VKI_BUILD_GFX11 if 
(PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { *pIntegerDotProduct8BitMixedSignednessAccelerated = VK_TRUE; + *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_TRUE; } else #endif { *pIntegerDotProduct8BitMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_FALSE; } } @@ -5117,19 +5145,20 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct4x8Properties( *pIntegerDotProduct4x8BitPackedUnsignedAccelerated = int8DotSupport; *pIntegerDotProduct4x8BitPackedSignedAccelerated = int8DotSupport; - *pIntegerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = int8DotSupport; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = int8DotSupport; #if VKI_BUILD_GFX11 if (PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { *pIntegerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_TRUE; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_TRUE; } else #endif { *pIntegerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_FALSE; } } @@ -7346,6 +7375,46 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV: + { + auto* pExtInfo = reinterpret_cast(pHeader); + if (updateFeatures) + { + pExtInfo->deviceGeneratedCommands = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_COMPUTE_FEATURES_NV: + { + auto* pExtInfo = reinterpret_cast(pHeader); + if (updateFeatures) + { + pExtInfo->deviceGeneratedCompute = VK_TRUE; + pExtInfo->deviceGeneratedComputePipelines = VK_FALSE; + pExtInfo->deviceGeneratedComputeCaptureReplay = VK_FALSE; + } + + structSize = sizeof(*pExtInfo); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NESTED_COMMAND_BUFFER_FEATURES_EXT: + { + auto* pExtInfo = reinterpret_cast(pHeader); + if (updateFeatures) + { + pExtInfo->nestedCommandBuffer = VK_TRUE; + pExtInfo->nestedCommandBufferRendering = VK_TRUE; + pExtInfo->nestedCommandBufferSimultaneousUse = VK_FALSE; + } + + structSize = sizeof(*pExtInfo); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAME_BOUNDARY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -8331,6 +8400,28 @@ void PhysicalDevice::GetDeviceProperties2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_PROPERTIES_NV: + { + auto* pProps = static_cast(pNext); + pProps->maxIndirectCommandsStreamCount = 1; + pProps->maxIndirectCommandsStreamStride = UINT32_MAX; + pProps->maxIndirectCommandsTokenCount = MaxIndirectTokenCount; + pProps->maxIndirectCommandsTokenOffset = MaxIndirectTokenOffset; + pProps->minIndirectCommandsBufferOffsetAlignment = 4; + pProps->minSequencesCountBufferOffsetAlignment = 4; + pProps->minSequencesIndexBufferOffsetAlignment = 4; + pProps->maxGraphicsShaderGroupCount = 0; + pProps->maxIndirectSequenceCount = UINT32_MAX >> 1; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NESTED_COMMAND_BUFFER_PROPERTIES_EXT: + { + auto* pProps = static_cast(pNext); + pProps->maxCommandBufferNestingLevel = 1; + break; + } + case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_KHR: { auto* pProps = static_cast(pNext); diff --git a/icd/api/vk_physical_device_manager.cpp b/icd/api/vk_physical_device_manager.cpp index 8e16d303..eb308126 100644 --- a/icd/api/vk_physical_device_manager.cpp +++ b/icd/api/vk_physical_device_manager.cpp @@ -39,6 +39,7 @@ #include "palVectorImpl.h" #include #include +#include "settings/experimentsLoader.h" namespace vk { @@ -51,7 +52,8 @@ PhysicalDeviceManager::PhysicalDeviceManager( m_pInstance(pInstance), m_pDisplayManager(pDisplayManager), m_devices(pInstance->Allocator()), - m_pAllNullProperties(nullptr) + m_pAllNullProperties(nullptr), + m_pExperimentsLoader(nullptr) { } @@ -107,7 +109,27 @@ VkResult PhysicalDeviceManager::Create( // ===================================================================================================================== VkResult PhysicalDeviceManager::Initialize() { - return UpdateLockedPhysicalDeviceList(); + VkResult result = VK_SUCCESS; + + void* pExpLoader = m_pInstance->AllocMem(sizeof(ExperimentsLoader), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pExpLoader != nullptr) + { + m_pExperimentsLoader = VK_PLACEMENT_NEW(pExpLoader) ExperimentsLoader(m_pInstance->PalPlatform()); + + result = PalToVkResult(m_pExperimentsLoader->Init()); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + result = UpdateLockedPhysicalDeviceList(); + } + + return result; } // ===================================================================================================================== @@ -118,6 +140,12 @@ PhysicalDeviceManager::~PhysicalDeviceManager() m_pInstance->FreeMem(m_pAllNullProperties); } + if (m_pExperimentsLoader != nullptr) + { + m_pExperimentsLoader->~ExperimentsLoader(); + m_pInstance->FreeMem(m_pExperimentsLoader); + } + DestroyLockedPhysicalDeviceList(); } @@ -274,7 +302,8 @@ VkResult PhysicalDeviceManager::UpdateLockedPhysicalDeviceList(void) if (pLoader != nullptr) { settingsArray[i] = VK_PLACEMENT_NEW(pLoader) VulkanSettingsLoader(pPalDeviceList[i], - m_pInstance->PalPlatform()); + m_pInstance->PalPlatform(), + m_pExperimentsLoader); } else { diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index 862380fe..0a793d0b 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -280,11 +280,11 @@ VkResult Pipeline::BuildShaderStageInfo( uint32_t maxOutIdx = 0; - for (uint32_t i = 0; i < stageCount; ++i) + for (uint32_t stageIdx = 0; stageIdx < stageCount; ++stageIdx) { - const VkPipelineShaderStageCreateInfo& stageInfo = pStages[i]; + const VkPipelineShaderStageCreateInfo& stageInfo = pStages[stageIdx]; const ShaderStage stage = ShaderFlagBitToStage(stageInfo.stage); - const uint32_t outIdx = pfnGetOutputIdx(i, stage); + const uint32_t outIdx = pfnGetOutputIdx(stageIdx, stage); maxOutIdx = Util::Max(maxOutIdx, outIdx + 1); @@ -335,28 +335,30 @@ VkResult Pipeline::BuildShaderStageInfo( if (pShaderModuleCreateInfo != nullptr) { - flags = pShaderModuleCreateInfo->flags; + flags = pShaderModuleCreateInfo->flags; shaderBinary.codeSize = pShaderModuleCreateInfo->codeSize; shaderBinary.pCode = pShaderModuleCreateInfo->pCode; codeHash = ShaderModule::BuildCodeHash( shaderBinary.pCode, shaderBinary.codeSize); - } - if (shaderBinary.pCode != nullptr) - { - result = pCompiler->BuildShaderModule( - pDevice, - flags, - VK_INTERNAL_SHADER_FLAGS_FORCE_UNCACHED_BIT, - shaderBinary, - &pTempModules[outIdx]); - - pShaderStageInfo[outIdx].pModuleHandle = &pTempModules[outIdx]; - 
pShaderStageInfo[outIdx].codeSize = shaderBinary.codeSize; + if (shaderBinary.pCode != nullptr) + { + result = pCompiler->BuildShaderModule( + pDevice, + flags, + VK_INTERNAL_SHADER_FLAGS_FORCE_UNCACHED_BIT, + shaderBinary, + &pTempModules[outIdx]); + + pTempModules[outIdx].codeHash = codeHash; + pShaderStageInfo[outIdx].pModuleHandle = &pTempModules[outIdx]; + pShaderStageInfo[outIdx].codeSize = shaderBinary.codeSize; + } } - else if (pPipelineShaderStageModuleIdentifierCreateInfoEXT != nullptr) + + if (pPipelineShaderStageModuleIdentifierCreateInfoEXT != nullptr) { // Get the 128 bit ShaderModule Hash VK_ASSERT(pPipelineShaderStageModuleIdentifierCreateInfoEXT->identifierSize == @@ -377,11 +379,7 @@ VkResult Pipeline::BuildShaderStageInfo( break; } - pShaderStageInfo[outIdx].codeHash = ShaderModule::GetCodeHash(codeHash, stageInfo.pName); - if (pShaderStageInfo[outIdx].pModuleHandle == &pTempModules[outIdx]) - { - pTempModules[outIdx].codeHash = pShaderStageInfo[outIdx].codeHash; - } + pShaderStageInfo[outIdx].codeHash = ShaderModule::GetCodeHash(codeHash, stageInfo.pName); } pShaderStageInfo[outIdx].stage = stage; @@ -446,9 +444,9 @@ void Pipeline::HandleExtensionStructs( // ===================================================================================================================== Pipeline::Pipeline( - Device* const pDevice, + Device* const pDevice, #if VKI_RAY_TRACING - bool hasRayTracing, + bool hasRayTracing, #endif VkPipelineBindPoint type) : @@ -1412,6 +1410,24 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR return (*pInternalRepresentationCount < numberOfInternalRepresentations) ? VK_INCOMPLETE : VK_SUCCESS; } +// ===================================================================================================================== +VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetPipelineIndirectDeviceAddressNV( + VkDevice device, + const VkPipelineIndirectDeviceAddressInfoNV* pInfo) +{ + VK_NOT_IMPLEMENTED; + return 0; +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetPipelineIndirectMemoryRequirementsNV( + VkDevice device, + const VkComputePipelineCreateInfo* pCreateInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + VK_NOT_IMPLEMENTED; +} + } // namespace entry } // namespace vk diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index 2925f676..71d28ffa 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -54,6 +54,9 @@ #include "sqtt/sqtt_layer.h" #include "palQueue.h" +#include "palVectorImpl.h" +#include "palListImpl.h" + namespace vk { @@ -79,10 +82,14 @@ Queue::Queue( m_queueFamilyIndex(queueFamilyIndex), m_queueIndex(queueIndex), m_queueFlags(queueFlags), - m_pDevModeMgr(pDevice->VkInstance()->GetDevModeMgr()), + m_pDevMode(pDevice->VkInstance()->GetDevModeMgr()), m_pStackAllocator(pStackAllocator), m_pCmdBufferRing(pCmdBufferRing), m_isDeviceIndependent(isDeviceIndependent) +#if VKI_RAY_TRACING + , m_pCpsGlobalMem(nullptr) + , m_cpsMemDestroyList(pDevice->VkInstance()->Allocator()) +#endif { if (ppPalQueues != nullptr) { @@ -881,6 +888,16 @@ Queue::~Queue() } } +#if VKI_RAY_TRACING + FreeRetiredCpsStackMem(); + VK_ASSERT(m_cpsMemDestroyList.NumElements() == 0); + + if (m_pCpsGlobalMem != nullptr) + { + m_pDevice->MemMgr()->FreeGpuMem(m_pCpsGlobalMem); + m_pCpsGlobalMem = nullptr; + } +#endif } // 
===================================================================================================================== @@ -1073,9 +1090,9 @@ VkResult Queue::Submit( VkFence fence) { #if ICD_GPUOPEN_DEVMODE_BUILD - DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); - bool timedQueueEvents = ((pDevModeMgr != nullptr) && pDevModeMgr->IsQueueTimingActive(m_pDevice)); + bool timedQueueEvents = ((pDevMode != nullptr) && pDevMode->IsQueueTimingActive(m_pDevice)); #else bool timedQueueEvents = false; #endif @@ -1087,6 +1104,10 @@ VkResult Queue::Submit( const bool isSynchronization2 = std::is_same::value; +#if VKI_RAY_TRACING + FreeRetiredCpsStackMem(); +#endif + // The fence should be only used in the last submission to PAL. The implicit ordering guarantees provided by PAL // make sure that the fence is only signaled when all submissions complete. if ((submitCount == 0) && (pFence != nullptr)) @@ -1168,7 +1189,7 @@ VkResult Queue::Submit( case VK_STRUCTURE_TYPE_FRAME_BOUNDARY_EXT: // Note: VK_EXT_frame_boundary is only intended for tools/debuggers // to be able to associate frame information with queue submissions. - DevModeFrameBoundary(pDevModeMgr, static_cast(pNext)); + DevModeFrameBoundary(pDevMode, static_cast(pNext)); break; default: @@ -1296,6 +1317,25 @@ VkResult Queue::Submit( ApiCmdBuffer* const * pCommandBuffers = reinterpret_cast(pCmdBuffers); +#if VKI_RAY_TRACING + uint64 maxCpsStackSize = 0; + Pal::IFence* pCpsMemFence = nullptr; + + for (uint32_t i = 0; i < cmdBufferCount; ++i) + { + const CmdBuffer& cmdBuf = *(*pCommandBuffers[i]); + + if (cmdBuf.GetCpsMemSize() > 0) + { + maxCpsStackSize = Util::Max(maxCpsStackSize, cmdBuf.GetCpsMemSize()); + } + } + + if (maxCpsStackSize > 0) + { + pCpsMemFence = GetCpsStackMem(deviceIdx, maxCpsStackSize); + } +#endif perSubQueueInfo.cmdBufferCount = 0; palSubmitInfo.stackSizeInDwords = 0; @@ -1320,6 +1360,11 @@ VkResult Queue::Submit( { pCmdBufInfos[i].isValid = true; pCmdBufInfos[i].rayTracingExecuted = true; + + if (m_pCpsGlobalMem != nullptr) + { + cmdBuf.ApplyPatchCpsRequests(deviceIdx, *m_pCpsGlobalMem->PalMemory(deviceIdx)); + } } #endif @@ -1402,11 +1447,23 @@ VkResult Queue::Submit( } } + Pal::IFence* iFence[2] = {nullptr, nullptr}; + palSubmitInfo.ppFences = iFence; + +#if VKI_RAY_TRACING + if (pCpsMemFence != nullptr) + { + iFence[0] = pCpsMemFence; + palSubmitInfo.fenceCount = 1; + } +#endif + if (lastBatch && (pFence != nullptr)) { pPalFence = pFence->PalFence(deviceIdx); - palSubmitInfo.ppFences = &pPalFence; - palSubmitInfo.fenceCount = 1; + + iFence[palSubmitInfo.fenceCount] = pPalFence; + palSubmitInfo.fenceCount++; pFence->SetActiveDevice(deviceIdx); } @@ -1483,7 +1540,9 @@ VkResult Queue::Submit( } else { - palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + { + palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + } } } else @@ -1492,7 +1551,7 @@ VkResult Queue::Submit( // TMZ is NOT supported for GPUOPEN path. 
VK_ASSERT((*pCommandBuffers[0])->IsProtected() == false); - palResult = m_pDevModeMgr->TimedQueueSubmit( + palResult = m_pDevMode->TimedQueueSubmit( deviceIdx, this, cmdBufferCount, @@ -1609,10 +1668,10 @@ VkResult Queue::PalSignalSemaphores( const uint32_t* pSemaphoreDeviceIndices) { #if ICD_GPUOPEN_DEVMODE_BUILD - DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); - bool timedQueueEvents = ((pDevModeMgr != nullptr) && - pDevModeMgr->IsQueueTimingActive(m_pDevice)); + bool timedQueueEvents = ((pDevMode != nullptr) && + pDevMode->IsQueueTimingActive(m_pDevice)); #else bool timedQueueEvents = false; #endif @@ -1657,7 +1716,7 @@ VkResult Queue::PalSignalSemaphores( else { #if ICD_GPUOPEN_DEVMODE_BUILD - palResult = pDevModeMgr->TimedSignalQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, + palResult = pDevMode->TimedSignalQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, pPalSemaphore); #else VK_NEVER_CALLED(); @@ -1686,10 +1745,10 @@ VkResult Queue::PalWaitSemaphores( uint32_t deviceIdx = DefaultDeviceIndex; #if ICD_GPUOPEN_DEVMODE_BUILD - DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); - bool timedQueueEvents = ((pDevModeMgr != nullptr) && - pDevModeMgr->IsQueueTimingActive(m_pDevice)); + bool timedQueueEvents = ((pDevMode != nullptr) && + pDevMode->IsQueueTimingActive(m_pDevice)); #else bool timedQueueEvents = false; #endif @@ -1736,7 +1795,7 @@ VkResult Queue::PalWaitSemaphores( else { #if ICD_GPUOPEN_DEVMODE_BUILD - palResult = pDevModeMgr->TimedWaitQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, + palResult = pDevMode->TimedWaitQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, pPalSemaphore); #else VK_NEVER_CALLED(); @@ -1759,7 +1818,7 @@ VkResult Queue::UpdateFlipStatus( { bool isOwner = false; Pal::IDevice* pPalDevice = m_pDevice->PalDevice(DefaultDeviceIndex); - uint32_t vidPnSourceId = pSwapChain->GetFullscreenMgr()->GetVidPnSourceId(); + uint32_t vidPnSourceId = pSwapChain->GetVidPnSourceId(); Pal::Result palResult = pPalDevice->GetFlipStatus(vidPnSourceId, &m_flipStatus.flipFlags, &isOwner); if (palResult == Pal::Result::Success) @@ -1805,9 +1864,9 @@ VkResult Queue::Present( if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); } #endif return VK_ERROR_INITIALIZATION_FAILED; @@ -1943,7 +2002,7 @@ VkResult Queue::Present( if (pSwapChain->GetFullscreenMgr() != nullptr) { Pal::Result palResult = m_pDevice->PalDevice(DefaultDeviceIndex)->PollFullScreenFrameMetadataControl( - pSwapChain->GetFullscreenMgr()->GetVidPnSourceId(), + pSwapChain->GetVidPnSourceId(), &m_palFrameMetadataControl); VK_ASSERT(palResult == Pal::Result::Success); @@ -1973,7 +2032,7 @@ VkResult Queue::Present( if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); } #endif @@ -2017,7 +2076,7 @@ VkResult Queue::Present( if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { 
m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); } #endif @@ -2603,9 +2662,7 @@ bool Queue::BuildPostProcessCommands( frameInfo.pSrcImage = pPresentInfo->pSrcImage; frameInfo.debugOverlay.presentMode = pPresentInfo->presentMode; frameInfo.debugOverlay.wsiPlatform = displayableInfo.palPlatform; - frameInfo.debugOverlay.presentKey = pSwapChain->IsDxgiEnabled() - ? Pal::PresentKeyFromPointer(pPresentInfo->pSwapChain) - : Pal::PresentKeyFromOsWindowHandle(displayableInfo.windowHandle); + frameInfo.debugOverlay.presentKey = Pal::PresentKeyFromOsWindowHandle(displayableInfo.windowHandle); } else { @@ -2645,42 +2702,6 @@ VkResult Queue::SubmitInternalCmdBuf( return pRing->SubmitCmdBuffer(m_pDevice, deviceIdx, m_pPalQueues[deviceIdx], cmdBufInfo, pCmdBufState); } -// ===================================================================================================================== -// Synchronize back buffer memory by doing a dummy submit with the written primary field set. -VkResult Queue::SynchronizeBackBuffer( - Memory* pMemory, - uint32_t deviceIdx) -{ - VkResult result = VK_SUCCESS; - - if (m_pDummyCmdBuffer[deviceIdx] == nullptr) - { - result = CreateDummyCmdBuffer(); - } - - if (result == VK_SUCCESS) - { - Pal::IGpuMemory* pGpuMem = pMemory->PalMemory(deviceIdx); - - Pal::PerSubQueueSubmitInfo perSubQueueInfo = {}; - - perSubQueueInfo.cmdBufferCount = 1; - perSubQueueInfo.ppCmdBuffers = &m_pDummyCmdBuffer[deviceIdx]; - perSubQueueInfo.pCmdBufInfoList = nullptr; - - Pal::SubmitInfo submitInfo = {}; - - submitInfo.pPerSubQueueInfo = &perSubQueueInfo; - submitInfo.perSubQueueInfoCount = 1; - submitInfo.blockIfFlippingCount = 1; - submitInfo.ppBlockIfFlipping = &pGpuMem; - - result = PalToVkResult(PalQueueSubmit(m_pDevice, m_pPalQueues[deviceIdx], submitInfo)); - } - - return result; -} - VkResult Queue::CreateSqttState( void* pMemory) { @@ -2701,7 +2722,7 @@ void Queue::InsertDebugUtilsLabel( #if ICD_GPUOPEN_DEVMODE_BUILD if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { - m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, DevModeMgr::FrameDelimiterType::QueueLabel); + m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, IDevMode::FrameDelimiterType::QueueLabel); if (settings.devModeBlockingEndFrameDebugUtils) { @@ -2720,7 +2741,7 @@ void Queue::InsertDebugUtilsLabel( #if ICD_GPUOPEN_DEVMODE_BUILD if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { - m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, DevModeMgr::FrameDelimiterType::QueueLabel); + m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, IDevMode::FrameDelimiterType::QueueLabel); } #endif } @@ -2730,24 +2751,138 @@ void Queue::InsertDebugUtilsLabel( // Notifies the trace tool about frame boundary, which is mainly used for unconventional vkQueuePresent-less // rendering or compute-only applications. 
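// [Editor's illustration, not part of the patch] How a present-less or compute-only workload reaches the handler
// below: chain a VkFrameBoundaryEXT into any queue submission to delimit logical frames.
void SubmitWithFrameBoundary(
    VkQueue         queue,
    VkCommandBuffer cmdBuf,
    uint64_t        frameId)
{
    VkFrameBoundaryEXT boundary = {};
    boundary.sType   = VK_STRUCTURE_TYPE_FRAME_BOUNDARY_EXT;
    boundary.flags   = VK_FRAME_BOUNDARY_FRAME_END_BIT_EXT;   // marks this submit as the end of a logical frame
    boundary.frameID = frameId;

    VkSubmitInfo submitInfo = {};
    submitInfo.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.pNext              = &boundary;
    submitInfo.commandBufferCount = 1;
    submitInfo.pCommandBuffers    = &cmdBuf;

    vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE);
}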
void Queue::DevModeFrameBoundary(
- DevModeMgr* pDevModeMgr,
+ IDevMode* pDevMode,
 const VkFrameBoundaryEXT* pFrameBoundaryInfo)
{
#if ICD_GPUOPEN_DEVMODE_BUILD
- if ((pDevModeMgr != nullptr) &&
+ if ((pDevMode != nullptr) &&
 (pFrameBoundaryInfo != nullptr))
 {
 if ((pFrameBoundaryInfo->flags & VK_FRAME_BOUNDARY_FRAME_END_BIT_EXT) != 0)
 {
- pDevModeMgr->NotifyFrameEnd(this,
- DevModeMgr::FrameDelimiterType::QueuePresent);
- pDevModeMgr->NotifyFrameBegin(this,
- DevModeMgr::FrameDelimiterType::QueuePresent);
+ pDevMode->NotifyFrameEnd(this,
+ IDevMode::FrameDelimiterType::QueuePresent);
+ pDevMode->NotifyFrameBegin(this,
+ IDevMode::FrameDelimiterType::QueuePresent);
 }
 }
#endif
}

+#if VKI_RAY_TRACING
+// =====================================================================================================================
+// Check the allocations in m_cpsMemDestroyList and free the retired ones.
+void Queue::FreeRetiredCpsStackMem()
+{
+ if (m_cpsMemDestroyList.NumElements() > 0)
+ {
+ for (CpsMemDestroyListIterator iter = m_cpsMemDestroyList.Begin(); iter.Get() != nullptr; )
+ {
+ CpsMemTracker* pTracker = iter.Get();
+ if (pTracker->pFence->GetStatus() == Pal::Result::Success)
+ {
+ m_pDevice->MemMgr()->FreeGpuMem(pTracker->pMem);
+ pTracker->pFence->Destroy();
+ m_pDevice->VkInstance()->FreeMem(pTracker->pFence);
+
+ // Erase() implicitly advances the iterator to the next node.
+ m_cpsMemDestroyList.Erase(&iter);
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+}
+
+// =====================================================================================================================
+// Get CPS global memory.
+// - Allocate it if it does not exist.
+// - Reallocate m_pCpsGlobalMem from X to Y if its size is not big enough. X is put into m_cpsMemDestroyList to be
+//   freed later. A fence is generated and passed in the submission to PAL; when it signals, X is freed. Note that the
+//   fence only signals once the first command buffer that switched to Y completes, so this is not optimal for memory
+//   footprint. Ideally X would be freed as soon as it is retired, but that would require every submission referencing
+//   X to signal an extra IFence even when m_pCpsGlobalMem stays unchanged, because we don't know whether the next
+//   submission will require a bigger CPS stack. 
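// [Editor's note, not part of the patch] Distilled per-submit sequence, matching GetCpsStackMem() below and its
// callers in Queue::Submit() above; all names are from the surrounding code:
//
//   FreeRetiredCpsStackMem();                                                 // reclaim allocs whose fence signaled
//   uint64 maxCpsStackSize = /* max of CmdBuffer::GetCpsMemSize() */;         // over all command buffers submitted
//   Pal::IFence* pCpsMemFence = GetCpsStackMem(deviceIdx, maxCpsStackSize);   // may grow m_pCpsGlobalMem
//   if (pCpsMemFence != nullptr) { /* add it to palSubmitInfo.ppFences */ }   // the fence gates the deferred free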
+Pal::IFence* Queue::GetCpsStackMem( + uint32_t deviceIdx, + uint64_t size) +{ + VK_ASSERT(m_pDevice->GetRuntimeSettings().cpsFlags & CpsFlagStackInGlobalMem); + + Pal::IFence* pFence = nullptr; + GpuRt::IDevice* pRtDevice = m_pDevice->RayTrace()->GpuRt(deviceIdx); + + //TODO: cap the size to a reasonable preset + if ((m_pCpsGlobalMem == nullptr) || (m_pCpsGlobalMem->Size() < size)) + { + InternalMemory* pCpsVidMem = nullptr; + + InternalMemCreateInfo allocInfo = {}; + m_pDevice->MemMgr()->GetCommonPool(InternalPoolGpuAccess, &allocInfo); + + m_pDevice->MemMgr()->AllocGpuMem(allocInfo, pCpsVidMem, 0, VK_OBJECT_TYPE_QUEUE, 0); + VK_ASSERT(pCpsVidMem != nullptr); + + Pal::Result palResult = Pal::Result::Success; + + if (m_pCpsGlobalMem == nullptr) // first alloc + { + m_pCpsGlobalMem = pCpsVidMem; + } + else if (pCpsVidMem != nullptr) + { + Pal::IDevice* pPalDevice = m_pDevice->PalDevice(deviceIdx); + + const size_t palFenceSize = pPalDevice->GetFenceSize(&palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + void* pPalMemory = m_pDevice->VkInstance()->AllocMem(palFenceSize, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + Pal::FenceCreateInfo fenceInfo = {}; + fenceInfo.flags.signaled = 0; + + if (pPalMemory != nullptr) + { + palResult = pPalDevice->CreateFence(fenceInfo, pPalMemory, &pFence); + + if (palResult == Pal::Result::Success) + { + CpsMemTracker tracker = { m_pCpsGlobalMem, pFence }; + m_cpsMemDestroyList.PushBack(tracker); + m_pCpsGlobalMem = pCpsVidMem; + } + else + { + VK_ASSERT(pFence == nullptr); + m_pDevice->VkInstance()->FreeMem(pPalMemory); + } + } + else + { + palResult = Pal::Result::ErrorOutOfMemory; + } + + if (palResult != Pal::Result::Success) + { + // Have to bear with the original allocation, expecting performance hit + m_pDevice->MemMgr()->FreeGpuMem(pCpsVidMem); + } + } + + // Initialize CPS Memory + if (palResult == Pal::Result::Success) + { + palResult = pRtDevice->InitializeCpsMemory(*m_pCpsGlobalMem->PalMemory(deviceIdx), size); + VK_ASSERT(palResult == Pal::Result::Success); + } + } + + return pFence; +} +#endif + /** *********************************************************************************************************************** * C-Callable entry points start here. These entries go in the dispatch table(s). diff --git a/icd/api/vk_semaphore.cpp b/icd/api/vk_semaphore.cpp index daae49f5..4223459a 100644 --- a/icd/api/vk_semaphore.cpp +++ b/icd/api/vk_semaphore.cpp @@ -481,7 +481,7 @@ VkResult Semaphore::WaitSemaphoreValue( VK_ASSERT(pSemaphore->IsTimelineSemaphore()); pPalSemaphore = pSemaphore->PalSemaphore(DefaultDeviceIndex); pSemaphore->RestoreSemaphore(); - palResult = pPalSemaphore->WaitSemaphoreValue(value, timeout); + palResult = pPalSemaphore->WaitSemaphoreValue(value, Uint64ToChronoNano(timeout)); } return PalToVkResult(palResult); diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp index 5d8f2d6e..5436a013 100644 --- a/icd/api/vk_swapchain.cpp +++ b/icd/api/vk_swapchain.cpp @@ -67,6 +67,7 @@ SwapChain::SwapChain( const Properties& properties, VkPresentModeKHR presentMode, FullscreenMgr* pFullscreenMgr, + uint32_t vidPnSourceId, Pal::WorkstationStereoMode wsStereoMode, Pal::ISwapChain* pPalSwapChain) : @@ -81,7 +82,8 @@ SwapChain::SwapChain( m_presentCount(0), m_presentMode(presentMode), m_deprecated(false) - , m_wsStereoMode(wsStereoMode) + , m_vidPnSourceId(vidPnSourceId), + m_wsStereoMode(wsStereoMode) { // Initialize the color gamut with the native values. 
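// [Editor's note, not part of the patch] The WaitSemaphoreValue() call above and the AcquireNextImage() timeout
// below now hand PAL std::chrono durations instead of raw nanosecond counts. A minimal sketch of the assumed
// helper shape (the real Uint64ToChronoNano lives in the driver's utility headers):
//
//   static inline std::chrono::nanoseconds Uint64ToChronoNano(uint64_t ns)
//   {
//       const uint64_t maxNs = static_cast<uint64_t>(std::chrono::nanoseconds::max().count());
//       return std::chrono::nanoseconds(static_cast<int64_t>(std::min<uint64_t>(ns, maxNs))); // clamp UINT64_MAX waits
//   }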
if (m_pFullscreenMgr != nullptr) @@ -443,8 +445,7 @@ VkResult SwapChain::Create( mode, pScreen, screenProperties.hDisplay, - swapChainCreateInfo.hWindow, - screenProperties.vidPnSourceId); + swapChainCreateInfo.hWindow); } } @@ -529,6 +530,7 @@ VkResult SwapChain::Create( properties, pCreateInfo->presentMode, pFullscreenMgr, + screenProperties.vidPnSourceId, wsStereoMode, pPalSwapChain); @@ -584,6 +586,7 @@ void SwapChain::Init(const VkAllocationCallbacks* pAllocator) { result = SetupAutoStereo(pAllocator); } + } // ===================================================================================================================== @@ -765,7 +768,7 @@ VkResult SwapChain::AcquireNextImage( if (result == VK_SUCCESS) { - acquireInfo.timeout = timeout; + acquireInfo.timeout = Uint64ToChronoNano(timeout); acquireInfo.pSemaphore = (pSemaphore != nullptr) ? pSemaphore->PalSemaphore(DefaultDeviceIndex) : nullptr; @@ -873,6 +876,14 @@ VkResult SwapChain::GetSwapchainImagesKHR( return result; } +// ===================================================================================================================== +bool SwapChain::IsFullscreenOrEfsePresent() const +{ + return ( + ((m_pFullscreenMgr != nullptr) && m_pFullscreenMgr->GetExclusiveModeFlags().acquired) + ); +} + // ===================================================================================================================== // Fills in the PAL swap chain present info with the appropriate image to present and returns its GPU memory. Pal::IGpuMemory* SwapChain::UpdatePresentInfo( @@ -896,11 +907,16 @@ Pal::IGpuMemory* SwapChain::UpdatePresentInfo( // Let the fullscreen manager perform any fullscreen ownership transitions and override some of this present // information in case it has enabled fullscreen. - if (m_pFullscreenMgr != nullptr) + if ((m_pFullscreenMgr != nullptr) + ) { - m_pFullscreenMgr->UpdatePresentInfo(this, pPresentInfo, flipFlags); + m_pFullscreenMgr->TryEnterExclusive(this); } + // Always fallback to windowed if FSE is not acquired to avoid missing presents. + pPresentInfo->presentMode = + IsFullscreenOrEfsePresent() ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed; + return pSrcImageGpuMemory; } @@ -1177,8 +1193,7 @@ FullscreenMgr::FullscreenMgr( FullscreenMgr::Mode mode, Pal::IScreen* pScreen, Pal::OsDisplayHandle hDisplay, - Pal::OsWindowHandle hWindow, - uint32_t vidPnSourceId) + Pal::OsWindowHandle hWindow) : m_pDevice{pDevice}, m_exclusiveModeFlags{}, @@ -1187,7 +1202,6 @@ FullscreenMgr::FullscreenMgr( m_fullscreenPresentSuccessCount{0}, m_hDisplay{hDisplay}, m_hWindow{hWindow}, - m_vidPnSourceId{vidPnSourceId}, m_mode{mode} { VK_ASSERT(m_pScreen != nullptr); @@ -1199,6 +1213,7 @@ FullscreenMgr::FullscreenMgr( bool FullscreenMgr::TryEnterExclusive( SwapChain* pSwapChain) { + // If we are not perma-disabled if (m_exclusiveModeFlags.disabled == 0) { @@ -1270,8 +1285,6 @@ bool FullscreenMgr::TryExitExclusive( if (m_pScreen != nullptr) { Pal::Result palResult = m_pScreen->ReleaseFullscreenOwnership(); - - VK_ASSERT((m_exclusiveModeFlags.acquired == 0) || (palResult == Pal::Result::Success)); } m_exclusiveModeFlags.acquired = 0; @@ -1500,7 +1513,7 @@ void FullscreenMgr::PostPresent( // DXGI fullscreen is OS controlled and may go in and out of fullscreen mode to deal with user interaction, // display toasts etc. Ignore reporting fullscreen errors on this platform. 
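// [Editor's note, not part of the patch] With FullscreenMgr::UpdatePresentInfo() deleted below, the present-mode
// policy is folded into SwapChain::UpdatePresentInfo() above: attempt TryEnterExclusive() on every present, then
//
//   pPresentInfo->presentMode =
//       IsFullscreenOrEfsePresent() ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed;
//
// so presents are never dropped while exclusive access is still being (re)negotiated.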
             if ((m_exclusiveModeFlags.acquired == 0) && (m_exclusiveModeFlags.mismatchedDisplayMode == 0) &&
-                (m_mode == Mode::Explicit) && (pSwapChain->IsDxgiEnabled() == false))
+                (m_mode == Mode::Explicit))
             {
                 *pPresentResult = Pal::Result::ErrorFullscreenUnavailable;
             }
@@ -1512,34 +1525,6 @@ void FullscreenMgr::PostPresent(
         }
     }
 }
 
-// =====================================================================================================================
-// This function potentially overrides normal swap chain present info by replacing a windowed present with a page-
-// flipped fullscreen present.
-//
-// This can only happen if the screen is currently compatible with fullscreen presents and we have successfully
-// acquired exclusive access to the screen.
-void FullscreenMgr::UpdatePresentInfo(
-    SwapChain*                  pSwapChain,
-    Pal::PresentSwapChainInfo*  pPresentInfo,
-    const Pal::FlipStatusFlags& flipFlags)
-{
-    // Present mode does not matter in DXGI as it is completely OS handled. This is for our internal tracking only
-    if (pSwapChain->IsDxgiEnabled())
-    {
-        // If KMD reported we're in Indpendent Flip we can assume that DXGI acquired FSE.
-        pPresentInfo->presentMode = flipFlags.iFlip ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed;
-    }
-    // Try to enter (or remain in) exclusive access mode on this swap chain's screen for this present
-    else
-    {
-        TryEnterExclusive(pSwapChain);
-
-        // Always fallback to windowed if FSE is not acquired to avoid missing presents.
-        pPresentInfo->presentMode =
-            m_exclusiveModeFlags.acquired ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed;
-    }
-}
-
 // =====================================================================================================================
 // This function determines whether it's safe to acquire full screen exclusive or not.
 Pal::Result FullscreenMgr::IsFullscreenOwnershipSafe() const
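The vk_utils.cpp hunk below removes the #if DEBUG guard around WaitIdleForDebugger, matching the settings_xgl.json change near the end of this patch that drops the DEBUG-only BuildTypes from the related settings. The function body is elided by the diff context; purely as an illustration, a spin-until-attached helper on Linux is commonly written against the TracerPid field of /proc/self/status, roughly like this sketch (not the driver's actual implementation):

#include <chrono>
#include <fstream>
#include <string>
#include <thread>

// Illustrative only: returns true once a tracer (debugger) has attached,
// detected via the TracerPid field of /proc/self/status on Linux.
static bool IsDebuggerAttached()
{
    std::ifstream status("/proc/self/status");
    std::string   line;

    while (std::getline(status, line))
    {
        if (line.rfind("TracerPid:", 0) == 0)
        {
            return std::stoi(line.substr(10)) != 0;
        }
    }

    return false;
}

// Spin idle, re-checking periodically, until a debugger hooks the process.
static void SpinUntilDebuggerAttached()
{
    while (IsDebuggerAttached() == false)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
}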
diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp
index a99dd888..45605b97 100644
--- a/icd/api/vk_utils.cpp
+++ b/icd/api/vk_utils.cpp
@@ -44,7 +44,6 @@ uint32_t GetBuildTimeHash()
     return Util::HashLiteralString(__DATE__ __TIME__);
 }
 
-#if DEBUG
 // =====================================================================================================================
 // If turned on and exe name is a match, this function spins idle until we have a debugger hooked.
 void WaitIdleForDebugger(
@@ -81,7 +80,6 @@ void WaitIdleForDebugger(
         }
     }
 }
-#endif
 
 } // namespace utils
 
diff --git a/icd/res/ver.h b/icd/res/ver.h
index 8f26df38..255caefd 100644
--- a/icd/res/ver.h
+++ b/icd/res/ver.h
@@ -36,7 +36,7 @@
 #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0"
 
 // Bump up after each promotion to mainline
-#define VULKAN_ICD_BUILD_VERSION 301
+#define VULKAN_ICD_BUILD_VERSION 304
 
 // String version is needed with leading zeros and extra termination (unicode)
 #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION
@@ -45,7 +45,7 @@
 // These values specify the driver ID and driver info string
 #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR   // "AMDOPEN"
 #define VULKAN_DRIVER_NAME_STR "AMD open-source driver"
-#define VULKAN_DRIVER_INFO_STR "2024.Q1.3"
+#define VULKAN_DRIVER_INFO_STR "2024.Q2.1"
 #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)"
 
 // These values tell which version of the conformance test the driver is compliant against
diff --git a/icd/settings/experimentsLoader.cpp b/icd/settings/experimentsLoader.cpp
new file mode 100644
index 00000000..3dc66fe8
--- /dev/null
+++ b/icd/settings/experimentsLoader.cpp
@@ -0,0 +1,71 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+#include "experimentsLoader.h"
+#include "dd_settings_service.h"
+#include "palPlatform.h"
+
+namespace vk
+{
+// =====================================================================================================================
+ExperimentsLoader::ExperimentsLoader(
+    Pal::IPlatform* pPlatform)
+    :
+    DevDriver::SettingsBase(&m_settings, sizeof(m_settings)),
+    m_pPlatform(pPlatform)
+{
+
+}
+
+// =====================================================================================================================
+ExperimentsLoader::~ExperimentsLoader()
+{
+}
+
+// =====================================================================================================================
+void ExperimentsLoader::Destroy()
+{
+}
+
+// =====================================================================================================================
+Pal::Result ExperimentsLoader::Init()
+{
+    Pal::Result palResult = Pal::Result::Unsupported;
+    DD_RESULT   result    = SetupDefaultsAndPopulateMap();
+    if (result == DD_RESULT_SUCCESS)
+    {
+        DevDriver::SettingsRpcService* pSettingsRpcService = m_pPlatform->GetSettingsRpcService();
+        if (pSettingsRpcService)
+        {
+            pSettingsRpcService->RegisterSettingsComponent(this);
+        }
+
+        palResult = Pal::Result::Success;
+    }
+
+    return palResult;
+}
+
+}
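How this loader is wired in is outside this excerpt (the vk_instance.cpp changes are not shown); a rough sketch of the intended usage, inferred from the constructor and Init() above plus the VulkanSettingsLoader changes later in this patch, is given below. Function and variable names are hypothetical, and error cleanup is omitted:

#include "settings/experimentsLoader.h"
#include "settings/settings.h"

// Hypothetical wiring: one ExperimentsLoader per platform, shared by the
// per-device settings loaders (see the settings.h hunk later in this patch).
Pal::Result InitLoaders(
    Pal::IPlatform* pPlatform,
    Pal::IDevice*   pPalDevice)
{
    vk::ExperimentsLoader* pExpLoader = new vk::ExperimentsLoader(pPlatform);

    Pal::Result result = pExpLoader->Init();

    if (result == Pal::Result::Success)
    {
        // The settings loader now receives the experiments loader so that
        // experiment overrides can be applied during ProcessSettings().
        vk::VulkanSettingsLoader* pSettingsLoader =
            new vk::VulkanSettingsLoader(pPalDevice, pPlatform, pExpLoader);

        result = pSettingsLoader->Init();
    }

    return result;
}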
diff --git a/icd/settings/experimentsLoader.h b/icd/settings/experimentsLoader.h
new file mode 100644
index 00000000..7924f334
--- /dev/null
+++ b/icd/settings/experimentsLoader.h
@@ -0,0 +1,79 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+#pragma once
+
+#include "dd_settings_base.h"
+#include "pal.h"
+
+#include "settings/g_experiments.h"
+
+// Forward declarations
+namespace Pal
+{
+class IPlatform;
+}
+
+namespace vk
+{
+// =====================================================================================================================
+// This class is responsible for loading the ExpSettings structure specified in the constructor. It is a helper class
+// that only exists for a short time while the settings are initialized.
+class ExperimentsLoader final : public DevDriver::SettingsBase
+{
+public:
+    explicit ExperimentsLoader(Pal::IPlatform* pPlatform);
+
+    virtual ~ExperimentsLoader();
+
+    Pal::Result Init();
+
+    void Destroy();
+
+    // Returns a const pointer to the settings struct
+    const ExpSettings* GetExpSettings() const { return &m_settings; }
+    // Returns a non-const pointer to the settings struct; should only be used when the settings will be modified
+    ExpSettings* GetMutableExpSettings() { return &m_settings; }
+
+    // Auto-generated
+    uint64_t GetSettingsBlobHash() const override;
+
+    void ReportVsyncState(ExpVSyncControl state) { m_settings.expVerticalSynchronization = state; }
+
+private:
+
+    PAL_DISALLOW_COPY_AND_ASSIGN(ExperimentsLoader);
+    PAL_DISALLOW_DEFAULT_CTOR(ExperimentsLoader);
+
+    // Auto-generated functions
+    virtual const char* GetComponentName() const override;
+    virtual DD_RESULT SetupDefaultsAndPopulateMap() override;
+
+    Pal::IPlatform* m_pPlatform;
+    ExpSettings     m_settings;
+};
+
+}
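The settings/g_experiments.h header included above is generated from the JSON file that follows. Judging by the HasValue()/Value()/ValueOr() accessors used in the settings.cpp changes later in this patch, each ExpSettings member behaves like an optional value that is only set when a tool writes the experiment over the settings service. A rough stand-in for such a wrapper is sketched below; the real generated type is DevDriver's and will differ:

#include <cassert>

// Rough stand-in for the generated optional-style experiment value: unset by
// default, set once a tool assigns it over the settings RPC service.
template <typename T>
class ExpValue
{
public:
    ExpValue() : m_hasValue(false), m_value{} {}

    bool HasValue() const { return m_hasValue; }

    // Precondition: HasValue() is true.
    T Value() const
    {
        assert(m_hasValue);
        return m_value;
    }

    // Returns the stored value, or the fallback when the experiment is unset.
    T ValueOr(const T& fallback) const { return m_hasValue ? m_value : fallback; }

    ExpValue& operator=(const T& value)
    {
        m_value    = value;
        m_hasValue = true;
        return *this;
    }

private:
    bool m_hasValue;
    T    m_value;
};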
diff --git a/icd/settings/experiments_settings_xgl.json b/icd/settings/experiments_settings_xgl.json
new file mode 100644
index 00000000..79466356
--- /dev/null
+++ b/icd/settings/experiments_settings_xgl.json
@@ -0,0 +1,287 @@
+{
+  "Version": 1,
+  "ComponentName": "Experiments",
+  "Enums": [
+    {
+      "Name": "ExpShaderWaveSize",
+      "Values": [
+        {
+          "Name": "ExpWaveSizeAuto",
+          "Value": 0,
+          "Description": "Select automatically"
+        },
+        {
+          "Name": "ExpWaveSizeWave64",
+          "Value": 2,
+          "Description": "Force 64 threads per wave"
+        },
+        {
+          "Name": "ExpWaveSizeWave32",
+          "Value": 3,
+          "Description": "Force 32 threads per wave"
+        },
+        {
+          "Name": "ExpWaveSizeInvalid",
+          "Value": 4,
+          "Description": "Invalid wave size"
+        }
+      ]
+    }
+  ],
+  "Settings": [
+    {
+      "Description": "Disable mesh shader support as reported by graphics API.",
+      "Tags": [
+        "Feature"
+      ],
+      "Type": "bool",
+      "Name": "ExpMeshShaderSupport",
+      "ExperimentName": "Disable Mesh Shader Support"
+    },
+    {
+      "Description": "Disable support for ray tracing as reported by graphics API.",
+      "Tags": [
+        "Feature"
+      ],
+      "BuildTypes": [
+        "VKI_RAY_TRACING"
+      ],
+      "Type": "bool",
+      "Name": "ExpRayTracingSupport",
+      "ExperimentName": "Disable Ray Tracing Support"
+    },
+    {
+      "Description": "Disable support for variable rate shading as reported by graphics API.",
+      "Tags": [
+        "Feature"
+      ],
+      "Type": "bool",
+      "Name": "ExpVariableRateShadingSupport",
+      "ExperimentName": "Disable Variable Rate Shading"
+    },
+    {
+      "Description": "Disable support for native 16-bit types in shaders.",
+      "Tags": [
+        "Feature"
+      ],
+      "Type": "bool",
+      "Name": "ExpNative16BitTypesSupport",
+      "ExperimentName": "Disable native 16-bit types support"
+    },
+    {
+      "Description": "Disable support for custom AMD extensions, as offered by AMD GPU Services library in DX12 and VK_AMD_* extensions in Vulkan.",
+      "Tags": [
+        "Feature"
+      ],
+      "Type": "bool",
+      "Name": "ExpAmdVendorExtensions",
+      "ExperimentName": "Disable AMD vendor extensions support"
+    },
+    {
+      "Description": "Disable asynchronous compute queues. When disabled, Vulkan doesn't expose additional compute queues.",
+      "Tags": [
+        "Feature"
+      ],
+      "Type": "bool",
+      "Name": "ExpComputeQueueSupport",
+      "ExperimentName": "Disable compute queue support"
+    },
+    {
+      "Description": "Disable barrier optimizations.",
+      "Tags": [
+        "Optimization"
+      ],
+      "Type": "bool",
+      "Name": "ExpBarrierOptimizations",
+      "ExperimentName": "Disable barrier optimizations"
+    },
+    {
+      "Description": "Disable miscellaneous shader compiler optimizations.",
+      "Tags": [
+        "Optimization"
+      ],
+      "Type": "bool",
+      "Name": "ExpShaderCompilerOptimizations",
+      "ExperimentName": "Disable shader compiler optimizations"
+    },
+    {
+      "Description": "Disable optimizations applied when building ray tracing acceleration structures.",
+      "Tags": [
+        "Optimization"
+      ],
+      "BuildTypes": [
+        "VKI_RAY_TRACING"
+      ],
+      "Type": "bool",
+      "Name": "ExpAccelStructureOpt",
+      "ExperimentName": "Disable acceleration structure optimizations"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all vertex shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpVsWaveSize",
+      "ExperimentName": "Vertex shader wave size"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all tess control shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpTcsWaveSize",
+      "ExperimentName": "Tess control shader wave size"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all tess eval shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpTesWaveSize",
+      "ExperimentName": "Tess eval shader wave size"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all geometry shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpGsWaveSize",
+      "ExperimentName": "Geometry shader wave size"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all fragment shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpFsWaveSize",
+      "ExperimentName": "Fragment shader wave size"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all compute shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpCsWaveSize",
+      "ExperimentName": "Compute shader wave size"
+    },
+    {
+      "Description": "Force specific wave (subgroup) size in all mesh shaders where possible.",
+      "Tags": [
+        "Optimization"
+      ],
+      "ValidValues": {
+        "Name": "ExpShaderWaveSize"
+      },
+      "Type": "enum",
+      "Name": "ExpMsWaveSize",
+      "ExperimentName": "Mesh shader wave size"
+    },
+    {
+      "Description": "Disable ray tracing shader inlining.",
+      "Tags": [
+        "Optimization"
+      ],
+      "BuildTypes": [
+        "VKI_RAY_TRACING"
+      ],
+      "Type": "bool",
+      "Name": "ExpRayTracingPipelineCompilationMode",
+      "ExperimentName": "Disable ray tracing shader inlining"
+    },
+    {
+      "Description": "Disable shader cache.",
+      "Tags": [
+        "Optimization"
+      ],
+      "Type": "bool",
+      "Name": "ExpShaderCache",
+      "ExperimentName": "Disable shader cache"
+    },
+    {
+      "Description": "Disable texture color compression.",
+      "Tags": [
+        "Safety"
+      ],
+      "Type": "bool",
+      "Name": "ExpTextureColorCompression",
"ExperimentName": "Disable Texture Color Compression" + }, + { + "Description": "Zero unbound descriptors.", + "Tags": [ + "Safety" + ], + "Type": "bool", + "Name": "ExpZeroUnboundDescriptors", + "ExperimentName": "Zero unbound descriptors" + }, + { + "Description": "Make command allocators thread safe.", + "Tags": [ + "Safety" + ], + "Type": "bool", + "Name": "ExpThreadSafeCommandAllocator", + "ExperimentName": "Thread-safe command allocator" + }, + { + "Description": "Enable / disable vertical synchronization.", + "Tags": [ + "Safety" + ], + "ValidValues": { + "IsEnum": true, + "Name": "ExpVSyncControl", + "Values": [ + { + "Name": "ExpVSyncControlAlwaysOff", + "Value": 0, + "Description": "Force Vsync Off." + }, + { + "Name": "ExpVSyncControlAlwaysOn", + "Value": 1, + "Description": "Force Vsync On." + }, + { + "Name": "ExpVSyncControlInvalid", + "Value": 2, + "Description": "Invalid value." + } + ] + }, + "Type": "enum", + "Name": "ExpVerticalSynchronization", + "ExperimentName": "Enable / disable vertical synchronization" + } + ], + "Tags": [ + "Feature", + "Optimization", + "Safety" + ] +} \ No newline at end of file diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 441b2528..8679fb01 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -32,6 +32,7 @@ #include "include/vk_utils.h" #include "settings/settings.h" #include "vkgcDefs.h" +#include "settings/g_experiments.h" #include "palFile.h" #include "palHashMapImpl.h" @@ -43,6 +44,7 @@ #include "devDriverServer.h" #include "protocols/ddSettingsService.h" #include "dd_settings_service.h" +#include "experimentsLoader.h" #include "../layers/include/query_dlist.h" @@ -54,18 +56,78 @@ using namespace DevDriver::SettingsURIService; using namespace Util; +#define PAL_SET_VAL_IF_EXPERIMENT_ENABLED(opt, var, val) if (pExpSettings->exp##opt.ValueOr(false)) \ +{ \ + pPalSettings->var = val; \ +} + +#define VK_SET_VAL_IF_EXPERIMENT_ENABLED(opt, var, val) if (pExpSettings->exp##opt.ValueOr(false)) \ +{ \ + m_settings.var = val; \ +} + namespace vk { +// ===================================================================================================================== +static uint32_t ExpSwsToXglSws( + ExpShaderWaveSize wsIn) +{ + uint32_t wsOut = WaveSizeAuto; + switch (wsIn) + { + case ExpWaveSizeWave64: + wsOut = 64; + break; + case ExpWaveSizeWave32: + wsOut = 32; + break; + case ExpWaveSizeAuto: + wsOut = 0; + break; + default: + wsOut = 0; + break; + } + + return wsOut; +} + +// ===================================================================================================================== +static ExpShaderWaveSize XglSwsToExpSws( + uint32_t wsIn) +{ + ExpShaderWaveSize wsOut = ExpWaveSizeInvalid; + switch (wsIn) + { + case 64: + wsOut = ExpWaveSizeWave64; + break; + case 32: + wsOut = ExpWaveSizeWave32; + break; + case 0: + wsOut = ExpWaveSizeAuto; + break; + default: + wsOut = ExpWaveSizeInvalid; + break; + } + + return wsOut; +} + // ===================================================================================================================== // Constructor for the SettingsLoader object. 
diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp
index 441b2528..8679fb01 100644
--- a/icd/settings/settings.cpp
+++ b/icd/settings/settings.cpp
@@ -32,6 +32,7 @@
 #include "include/vk_utils.h"
 #include "settings/settings.h"
 #include "vkgcDefs.h"
+#include "settings/g_experiments.h"
 
 #include "palFile.h"
 #include "palHashMapImpl.h"
@@ -43,6 +44,7 @@
 #include "devDriverServer.h"
 #include "protocols/ddSettingsService.h"
 #include "dd_settings_service.h"
+#include "experimentsLoader.h"
 
 #include "../layers/include/query_dlist.h"
 
@@ -54,18 +56,78 @@ using namespace DevDriver::SettingsURIService;
 
 using namespace Util;
 
+#define PAL_SET_VAL_IF_EXPERIMENT_ENABLED(opt, var, val) if (pExpSettings->exp##opt.ValueOr(false)) \
+{                                                                                                   \
+    pPalSettings->var = val;                                                                        \
+}
+
+#define VK_SET_VAL_IF_EXPERIMENT_ENABLED(opt, var, val) if (pExpSettings->exp##opt.ValueOr(false)) \
+{                                                                                                  \
+    m_settings.var = val;                                                                          \
+}
+
 namespace vk
 {
 
+// =====================================================================================================================
+static uint32_t ExpSwsToXglSws(
+    ExpShaderWaveSize wsIn)
+{
+    uint32_t wsOut = WaveSizeAuto;
+    switch (wsIn)
+    {
+    case ExpWaveSizeWave64:
+        wsOut = 64;
+        break;
+    case ExpWaveSizeWave32:
+        wsOut = 32;
+        break;
+    case ExpWaveSizeAuto:
+        wsOut = 0;
+        break;
+    default:
+        wsOut = 0;
+        break;
+    }
+
+    return wsOut;
+}
+
+// =====================================================================================================================
+static ExpShaderWaveSize XglSwsToExpSws(
+    uint32_t wsIn)
+{
+    ExpShaderWaveSize wsOut = ExpWaveSizeInvalid;
+    switch (wsIn)
+    {
+    case 64:
+        wsOut = ExpWaveSizeWave64;
+        break;
+    case 32:
+        wsOut = ExpWaveSizeWave32;
+        break;
+    case 0:
+        wsOut = ExpWaveSizeAuto;
+        break;
+    default:
+        wsOut = ExpWaveSizeInvalid;
+        break;
+    }
+
+    return wsOut;
+}
+
 // =====================================================================================================================
 // Constructor for the SettingsLoader object.
 VulkanSettingsLoader::VulkanSettingsLoader(
-    Pal::IDevice*   pDevice,
-    Pal::IPlatform* pPlatform)
+    Pal::IDevice*      pDevice,
+    Pal::IPlatform*    pPlatform,
+    ExperimentsLoader* pExpLoader)
     :
     DevDriver::SettingsBase(&m_settings, sizeof(m_settings)),
     m_pDevice(pDevice),
-    m_pPlatform(pPlatform)
+    m_pPlatform(pPlatform),
+    m_pExperimentsLoader(pExpLoader)
 {
 
 }
@@ -135,33 +197,213 @@ void VulkanSettingsLoader::OverrideSettingsBySystemInfo()
     }
 }
 
+// =====================================================================================================================
+// Overrides the experiments info
+void VulkanSettingsLoader::OverrideDefaultsExperimentInfo()
+{
+    const ExpSettings*      pExpSettings = m_pExperimentsLoader->GetExpSettings();
+    Pal::PalPublicSettings* pPalSettings = m_pDevice->GetPublicSettings();
+
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(MeshShaderSupport, enableMeshShaders, false);
+
+#if VKI_RAY_TRACING
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(RayTracingSupport, enableRaytracingSupport, false);
+#endif
+
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(Native16BitTypesSupport, enableNative16BitTypes, false);
+
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(AmdVendorExtensions, disableAmdVendorExtensions, true);
+
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(ComputeQueueSupport, asyncComputeQueueLimit, 0);
+
+    if (pExpSettings->expBarrierOptimizations.ValueOr(false))
+    {
+        pPalSettings->pwsMode                 = Pal::PwsMode::Disabled;
+        m_settings.useAcquireReleaseInterface = false;
+    }
+
+    if (pExpSettings->expShaderCompilerOptimizations.ValueOr(false))
+    {
+        m_settings.disableLoopUnrolls = true;
+    }
+
+#if VKI_RAY_TRACING
+    if (pExpSettings->expAccelStructureOpt.ValueOr(false))
+    {
+        m_settings.rtEnableTreeRebraid            = RebraidTypeOff;
+        m_settings.rtEnableTriangleSplitting      = false;
+        m_settings.rtEnableTopDownBuild           = false;
+        m_settings.rtBvhBuildModeFastBuild        = BvhBuildModeLinear;
+        m_settings.enablePairCompressionCostCheck = true;
+    }
+#endif
+
+    if (pExpSettings->expVsWaveSize.HasValue())
+    {
+        m_settings.vsWaveSize = ExpSwsToXglSws(pExpSettings->expVsWaveSize.Value());
+    }
+
+    if (pExpSettings->expTcsWaveSize.HasValue())
+    {
+        m_settings.tcsWaveSize = ExpSwsToXglSws(pExpSettings->expTcsWaveSize.Value());
+    }
+
+    if (pExpSettings->expTesWaveSize.HasValue())
+    {
+        m_settings.tesWaveSize = ExpSwsToXglSws(pExpSettings->expTesWaveSize.Value());
+    }
+
+    if (pExpSettings->expGsWaveSize.HasValue())
+    {
+        m_settings.gsWaveSize = ExpSwsToXglSws(pExpSettings->expGsWaveSize.Value());
+    }
+
+    if (pExpSettings->expFsWaveSize.HasValue())
+    {
+        m_settings.fsWaveSize = ExpSwsToXglSws(pExpSettings->expFsWaveSize.Value());
+    }
+
+    if (pExpSettings->expCsWaveSize.HasValue())
+    {
+        m_settings.csWaveSize = ExpSwsToXglSws(pExpSettings->expCsWaveSize.Value());
+    }
+
+    if (pExpSettings->expMsWaveSize.HasValue())
+    {
+        m_settings.meshWaveSize = ExpSwsToXglSws(pExpSettings->expMsWaveSize.Value());
+    }
+
+#if VKI_RAY_TRACING
+    if (pExpSettings->expRayTracingPipelineCompilationMode.ValueOr(false))
+    {
+        m_settings.rtCompileMode = RtCompileModeIndirect;
+    }
+#endif
+
+    if (pExpSettings->expShaderCache.ValueOr(false))
+    {
+        m_settings.shaderCacheMode                  = ShaderCacheDisable;
+        m_settings.usePalPipelineCaching            = false;
+        m_settings.allowExternalPipelineCacheObject = false;
+    }
+
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(TextureColorCompression, forceEnableDcc, ForceDisableDcc);
+
+    PAL_SET_VAL_IF_EXPERIMENT_ENABLED(ZeroUnboundDescriptors, zeroUnboundDescDebugSrd, true);
+
+    VK_SET_VAL_IF_EXPERIMENT_ENABLED(ThreadSafeCommandAllocator, threadSafeAllocator, true);
+
+    if (pExpSettings->expVerticalSynchronization.HasValue())
+    {
+        ExpVSyncControl state = pExpSettings->expVerticalSynchronization.Value();
+        if (state == ExpVSyncControlAlwaysOn)
+        {
+            m_settings.vSyncControl = VSyncControlAlwaysOn;
+        }
+        else if (state == ExpVSyncControlAlwaysOff)
+        {
+            m_settings.vSyncControl = VSyncControlAlwaysOff;
+        }
+        else
+        {
+            m_pExperimentsLoader->ReportVsyncState(ExpVSyncControlInvalid);
+            PAL_ASSERT_ALWAYS();
+        }
+    }
+}
+
+// =====================================================================================================================
+// Sets the final values for the experiments
+void VulkanSettingsLoader::FinalizeExperiments()
+{
+    ExpSettings*            pExpSettings = m_pExperimentsLoader->GetMutableExpSettings();
+    Pal::PalPublicSettings* pPalSettings = m_pDevice->GetPublicSettings();
+
+    pExpSettings->expMeshShaderSupport = (m_settings.enableMeshShaders == false);
+
+#if VKI_RAY_TRACING
+    pExpSettings->expRayTracingSupport = (m_settings.enableRaytracingSupport == false);
+#endif
+
+    pExpSettings->expVariableRateShadingSupport = (m_settings.enableVariableRateShading == false);
+
+    pExpSettings->expNative16BitTypesSupport = (m_settings.enableNative16BitTypes == false);
+
+    pExpSettings->expAmdVendorExtensions = m_settings.disableAmdVendorExtensions;
+
+    pExpSettings->expComputeQueueSupport = (m_settings.asyncComputeQueueLimit == 0);
+
+    pExpSettings->expBarrierOptimizations = ((pPalSettings->pwsMode == Pal::PwsMode::Disabled) &&
+                                             (m_settings.useAcquireReleaseInterface == false));
+
+    pExpSettings->expVsWaveSize = XglSwsToExpSws(m_settings.vsWaveSize);
+
+    pExpSettings->expTcsWaveSize = XglSwsToExpSws(m_settings.tcsWaveSize);
+
+    pExpSettings->expTesWaveSize = XglSwsToExpSws(m_settings.tesWaveSize);
+
+    pExpSettings->expGsWaveSize = XglSwsToExpSws(m_settings.gsWaveSize);
+
+    pExpSettings->expFsWaveSize = XglSwsToExpSws(m_settings.fsWaveSize);
+
+    pExpSettings->expCsWaveSize = XglSwsToExpSws(m_settings.csWaveSize);
+
+    pExpSettings->expMsWaveSize = XglSwsToExpSws(m_settings.meshWaveSize);
+
+#if VKI_RAY_TRACING
+    pExpSettings->expRayTracingPipelineCompilationMode = (m_settings.rtCompileMode == RtCompileModeIndirect);
+#endif
+
+    pExpSettings->expTextureColorCompression = (m_settings.forceEnableDcc == ForceDisableDcc);
+
+    pExpSettings->expZeroUnboundDescriptors = pPalSettings->zeroUnboundDescDebugSrd;
+
+    pExpSettings->expThreadSafeCommandAllocator = m_settings.threadSafeAllocator;
+}
+
+// =====================================================================================================================
+// Informs tools of unsupported experiments
+void VulkanSettingsLoader::ReportUnsupportedExperiments(
+    Pal::DeviceProperties* pInfo)
+{
+    if (pInfo->gfxipProperties.flags.supportDoubleRate16BitInstructions == 0)
+    {
+        m_pExperimentsLoader->SaveUnsupportedExperiment(expNative16BitTypesSupportHash);
+    }
+
+    if (pInfo->gfxipProperties.srdSizes.bvh == 0)
+    {
+#if VKI_RAY_TRACING
+        m_pExperimentsLoader->SaveUnsupportedExperiment(expAccelStructureOptHash);
+        m_pExperimentsLoader->SaveUnsupportedExperiment(expRayTracingSupportHash);
+#endif
+    }
+
+    if (pInfo->gfxipProperties.flags.supportMeshShader == 0)
+    {
+        m_pExperimentsLoader->SaveUnsupportedExperiment(expMeshShaderSupportHash);
+    }
+
+    if (pInfo->gfxipProperties.supportedVrsRates == 0)
+    {
+        m_pExperimentsLoader->SaveUnsupportedExperiment(expVariableRateShadingSupportHash);
+    }
+}
+
 // =====================================================================================================================
 // Override defaults based on application profile. This occurs before any CCC settings or private panel settings are
 // applied.
 VkResult VulkanSettingsLoader::OverrideProfiledSettings(
     const VkAllocationCallbacks* pAllocCb,
     uint32_t                     appVersion,
-    AppProfile                   appProfile)
+    AppProfile                   appProfile,
+    Pal::DeviceProperties*       pInfo)
 {
     VkResult result = VkResult::VK_SUCCESS;
 
     Pal::PalPublicSettings* pPalSettings = m_pDevice->GetPublicSettings();
 
-    Pal::DeviceProperties* pInfo = static_cast<Pal::DeviceProperties*>(
-        pAllocCb->pfnAllocation(pAllocCb->pUserData,
-                                sizeof(Pal::DeviceProperties),
-                                VK_DEFAULT_MEM_ALIGN,
-                                VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE));
-
-    if (pInfo == nullptr)
-    {
-        result = VkResult::VK_ERROR_OUT_OF_HOST_MEMORY;
-    }
-
-    if (result == VkResult::VK_SUCCESS)
-    {
-        memset(pInfo, 0, sizeof(Pal::DeviceProperties));
-        m_pDevice->GetProperties(pInfo);
 
         // By allowing the enable/disable to be set by environment variable, any third party platform owners
         // can enable or disable the feature based on their internal feedback and not have to wait for a driver
@@ -972,7 +1214,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings(
         if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp10_3)
         {
             m_settings.rtEnableCompilePipelineLibrary = false;
-            m_settings.rtMaxRayRecursionDepth         = 2;
         }
 
 #if VKI_BUILD_GFX11
@@ -1112,6 +1353,16 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings(
             m_settings.forceDepthClampBasedOnZExport = true;
         }
 
+#ifndef ICD_X64_BUILD
+        if (appProfile == AppProfile::DXVK)
+        {
+            // DXVK Tropic4/GTA4 page fault when GPL is enabled; it looks like an incorrect pipeline layout is used.
+            // Forcing the indirect scheme makes the optimized pipeline layout compatible with the fast-linked
+            // pipeline.
+            m_settings.pipelineLayoutSchemeSelectionStrategy = PipelineLayoutSchemeSelectionStrategy::ForceIndirect;
+        }
+#endif
+
         if (appProfile == AppProfile::AshesOfTheSingularity)
         {
             // Disable image type checking on Navi10 to avoid 2.5% loss in Ashes
@@ -1359,17 +1610,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings(
         {
             m_settings.disableSingleMipAnisoOverride = false;
         }
-
-        if (appProfile == AppProfile::Enshrouded)
-        {
-#if VKI_BUILD_GFX11
-            if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp11_0)
-            {
-            }
-#endif
-        }
-
-        pAllocCb->pfnFree(pAllocCb->pUserData, pInfo);
     }
 
     return result;
 }
@@ -1412,7 +1652,10 @@ VkResult VulkanSettingsLoader::ProcessSettings(
     uint32_t                     appVersion,
     AppProfile*                  pAppProfile)
 {
-    VkResult result = VkResult::VK_SUCCESS;
+    VkResult              result = VkResult::VK_SUCCESS;
+    Pal::DeviceProperties info   = {};
+
+    m_pDevice->GetProperties(&info);
 
     // The following lines to load profile settings have been copied from g_settings.cpp
     static_cast<Pal::IDevice*>(m_pDevice)->ReadSetting(pForceAppProfileEnableHashStr,
@@ -1439,16 +1682,20 @@ VkResult VulkanSettingsLoader::ProcessSettings(
 #endif
 
     // Override defaults based on application profile
-    result = OverrideProfiledSettings(pAllocCb, appVersion, *pAppProfile);
+    result = OverrideProfiledSettings(pAllocCb, appVersion, *pAppProfile, &info);
 
     if (result == VkResult::VK_SUCCESS)
    {
+        ReportUnsupportedExperiments(&info);
+
         // Read in the public settings from the Catalyst Control Center
         ReadPublicSettings();
 
         // Read the rest of the settings from the registry
         ReadSettings();
 
+        OverrideDefaultsExperimentInfo();
+
         // We need to override debug file paths settings to absolute paths as per system info
         OverrideSettingsBySystemInfo();
 
@@ -1656,6 +1903,10 @@ void VulkanSettingsLoader::UpdatePalSettings()
 
     pPalSettings->rpmViewsBypassMall = static_cast<Pal::RpmViewsBypassMall>(m_settings.rpmViewsBypassMall);
 
+    // Allow Device Generated Commands to employ the state-of-the-art CP packet path whenever possible for optimal
+    // performance. Otherwise, only the obsolete compute shader path can be used.
+    pPalSettings->enableExecuteIndirectPacket = true;
+
     // Controls PWS enable mode: disabled, fully enabled or partially enabled. Only takes effect if HW supports PWS and
     // Acq-rel barriers
     if (m_settings.useAcquireReleaseInterface)
@@ -1710,6 +1961,9 @@ void VulkanSettingsLoader::FinalizeSettings(
     }
 
     GenerateSettingHash();
+
+    // Note: this should be the last thing done when we finalize, so we can capture any changes:
+    FinalizeExperiments();
 }
 
 // =====================================================================================================================
diff --git a/icd/settings/settings.h b/icd/settings/settings.h
index 1a3b74c5..b74e15ee 100644
--- a/icd/settings/settings.h
+++ b/icd/settings/settings.h
@@ -48,17 +48,20 @@ namespace Pal
 {
 class IDevice;
 class IPlatform;
+struct DeviceProperties;
 }
 
 namespace vk
 {
 
+class ExperimentsLoader;
+
 // =====================================================================================================================
 // This class is responsible for loading and processing the Vulkan runtime settings structure encapsulated in the Vulkan
 // Settings Loader object.
 class VulkanSettingsLoader : public DevDriver::SettingsBase
 {
 public:
-    explicit VulkanSettingsLoader(Pal::IDevice* pDevice, Pal::IPlatform* pPlatform);
+    explicit VulkanSettingsLoader(Pal::IDevice* pDevice, Pal::IPlatform* pPlatform, ExperimentsLoader* pExpLoader);
 
     virtual ~VulkanSettingsLoader();
 
     Pal::Result Init();
@@ -102,10 +105,17 @@ class VulkanSettingsLoader : public DevDriver::SettingsBase
     VkResult OverrideProfiledSettings(
         const VkAllocationCallbacks* pAllocCb,
         uint32_t                     appVersion,
-        AppProfile                   appProfile);
+        AppProfile                   appProfile,
+        Pal::DeviceProperties*       pInfo);
+
+    void ReportUnsupportedExperiments(Pal::DeviceProperties* pInfo);
 
     void OverrideSettingsBySystemInfo();
 
+    void OverrideDefaultsExperimentInfo();
+
+    void FinalizeExperiments();
+
     void DumpAppProfileChanges(
         AppProfile appProfile);
 
@@ -113,6 +123,7 @@ class VulkanSettingsLoader : public DevDriver::SettingsBase
 
     Pal::IDevice*         m_pDevice;
     Pal::IPlatform*       m_pPlatform;
+    ExperimentsLoader*    m_pExperimentsLoader;
     RuntimeSettings       m_settings;
     Util::MetroHash::Hash m_settingsHash;
 };
diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json
index b7fcccd9..0f88fd4f 100644
--- a/icd/settings/settings_xgl.json
+++ b/icd/settings/settings_xgl.json
@@ -2906,7 +2906,7 @@
         "VKI_RAY_TRACING"
       ],
       "Defaults": {
-        "Default": 1
+        "Default": 31
       },
       "Type": "uint32",
       "Scope": "Driver"
@@ -3203,12 +3203,42 @@
         "VKI_RAY_TRACING"
       ],
       "Defaults": {
-        "Default": 1.3
+        "Default": 1.15
       },
       "Type": "float",
       "Name": "RtTriangleSplittingFactor",
       "Scope": "Driver"
     },
+    {
+      "Description": "If RtEnableTriangleSplitting is enabled, this setting limits the maximum number of splits per triangle. A value of 0 disables the limit.",
+      "Tags": [
+        "Ray Tracing"
+      ],
+      "BuildTypes": [
+        "VKI_RAY_TRACING"
+      ],
+      "Defaults": {
+        "Default": 0
+      },
+      "Type": "uint32",
+      "Name": "RtTriangleSplittingBudgetPerTriangle",
+      "Scope": "Driver"
+    },
+    {
+      "Description": "If RtEnableTriangleSplitting is enabled, this factor affects the priority in triangle splitting.",
+      "Tags": [
+        "Ray Tracing"
+      ],
+      "BuildTypes": [
+        "VKI_RAY_TRACING"
+      ],
+      "Defaults": {
+        "Default": 1.0
+      },
+      "Type": "float",
+      "Name": "RtTriangleSplittingPriority",
+      "Scope": "Driver"
+    },
     {
       "Name": "RtEnableMortonCode30",
       "Description": "Enable Morton Code 30 bits",
@@ -3923,6 +3953,21 @@
       "Name": "RtEnableBuildParallel",
       "Scope": "Driver"
     },
+    {
+      "Description": "When the LBVH builder is selected, enable the Fast LBVH path.",
+      "Tags": [
+        "Ray Tracing"
+      ],
+      "BuildTypes": [
+        "VKI_RAY_TRACING"
+      ],
+      "Defaults": {
+        "Default": false
+      },
+      "Type": "bool",
+      "Name": "RtEnableFastLBVH",
+      "Scope": "Driver"
+    },
     {
       "Description": "Waves per SIMD to launch for parallel build. 0 chooses the default.",
       "Tags": [
         "Ray Tracing"
@@ -3980,21 +4025,6 @@
         "Default": 32
       }
     },
-    {
-      "Name": "RtEnableAccelerationStructureScratchMemoryDump",
-      "Description": "Dumps scratch memory from acceleration structures. Written to the directory specified by BaseLogDirPath.",
-      "Tags": [
-        "Ray Tracing"
-      ],
-      "BuildTypes": [
-        "VKI_RAY_TRACING"
-      ],
-      "Defaults": {
-        "Default": false
-      },
-      "Type": "bool",
-      "Scope": "Driver"
-    },
     {
       "Name": "RtEnableBuildAccelStructStats",
       "Description": "Dump built acceleration stats. (Pending implementation)",
@@ -5318,21 +5348,6 @@
       "Name": "RtIndirectVgprLimit",
       "Scope": "Driver"
     },
-    {
-      "Description": "Acceleration structures used in TraceRay are tracked for dumping purposes. Disables tracking during Build calls.",
-      "Tags": [
-        "Ray Tracing"
-      ],
-      "BuildTypes": [
-        "VKI_RAY_TRACING"
-      ],
-      "Defaults": {
-        "Default": false
-      },
-      "Type": "bool",
-      "Name": "EnableTraceRayAccelStructTracking",
-      "Scope": "Driver"
-    },
     {
       "Description": "Force rebuild for acceleration structure updates.",
       "Tags": [
@@ -8743,9 +8758,6 @@
       "Tags": [
         "Debugging"
       ],
-      "BuildTypes": [
-        "DEBUG"
-      ],
       "Defaults": {
         "Default": false
       },
@@ -8758,9 +8770,6 @@
       "Tags": [
         "Debugging"
       ],
-      "BuildTypes": [
-        "DEBUG"
-      ],
       "Defaults": {
         "Default": ""
       },
@@ -8773,9 +8782,6 @@
       "Tags": [
         "Debugging"
      ],
-      "BuildTypes": [
-        "DEBUG"
-      ],
       "Defaults": {
         "Default": 0
       },
@@ -8945,7 +8951,7 @@
       "Name": "ReportSuboptimalPresentAsOutOfDate"
     },
     {
-      "Name": "ExportNVComputeShaderDerivatives",
+      "Name": "ExportNvComputeShaderDerivatives",
       "Description": "Export extension NV_compute_shader_derivatives",
       "Tags": [
         "General"
       ],
       "Defaults": {
         "Default": false
       },
       "Type": "bool",
       "Scope": "Driver"
     },
+    {
+      "Name": "ExportNvDeviceGeneratedCommands",
+      "Description": "Export extension NV_device_generated_commands and NV_device_generated_commands_compute",
+      "Tags": [
+        "General"
+      ],
+      "Defaults": {
+        "Default": false
+      },
+      "Type": "bool",
+      "Scope": "Driver"
+    },
     {
       "Name": "ExportImageCompressionControl",
       "Description": "Export extension VK_EXT_image_compression_control",