From 02e867e81626b9566c5cc24f134ec0760e93a94a Mon Sep 17 00:00:00 2001
From: wenqinli
Date: Thu, 18 May 2023 23:46:17 +0800
Subject: [PATCH] Update xgl from commit 2aeb0b25

* Hook up resources for ray history traces via PAL trace
* Initial changes for VK_EXT_device_address_binding_report
* Update PAL version in XGL to 796
* Update Khronos Vulkan headers to 1.3.250
* Refine pipeline cache in GPL
* Fix push descriptors spilled to the spill table
* The api hash dump name should be set to false by default
* Support new dynamic state vertexBufferCount
* Refine pipeline dump
* Check whether the image view of the color/depth/stencil attachment is a null handle
* Fix GPL fast-link failure on nv2x
* Link enableImplicitInvariantExports to driver-level disableImplicitInvariantExports
* [DebugPrintf] Fixed the case that reads entryHeader
* [DebugPrintf] Fixed the pPtr header
* Support BinaryInfo in ShaderEarlyCompileInfo for GPL caching
* Enable PalDeveloperCb immediately after the Pal Platform is created
* Fix ColorspaceHelper's lookup table so it only reports formats that are legal
* Bump GpuRT version to 33
* Add ASTC HDR support
* [CTS_Next] dEQP-VK.pipeline.*.extended_dynamic_state.*.large_static_rasterization_samples_* - tests crash
* [CTS_Next] dEQP-VK.pipeline.*.extended_dynamic_state.*color_blend_dual* - tests fail
* Update supported CTS to 1.3.3.1
* Expose VK_EXT_mutable_descriptor_type
* Add EnableFusedInstanceNode
* Navi3x tuning for Rage2
* Add appProfile for SOTTR
* Fix noisy asserts & alerts
* Fix correlation information not being provided for some binds
* Mutable descriptor type: fix array element pointer calculation bug
* Remove descriptor buffer memory type for images
* Fix potential crash when calculating pipeline cache id
* Force initialization of disableImplicitInvariantExports in LLPC builds
* Expose disableImplicitInvariantExports to LLPC
* Switch to parallel build path
* Add sleep to help wait for debugger to attach
* Fix corruption observed in Unigine Heaven/Unigine Valley using Zink
* Fix SE 5 Radeon ReLive TDR
* Fix HDR not being supported on Doom Eternal
* Configure emulatedRtIp in raytracing device init
* Fix issues in DebugPrintf
* [Navi31][Total War: Rome Remastered] Fix random corruption in gameplay
* Yuzu - Metroid Prime Remastered: fix corruption & crash on game load
---
 cmake/XglVersions.cmake | 4 +-
 icd/Loader/LunarG/Lnx/amd-icd.json | 4 +-
 icd/api/app_profile.cpp | 31 +
 icd/api/app_shader_optimizer.cpp | 3 +-
 .../{Navi31 => generic}/Rage2/profile.json | 0
 icd/api/color_space_helper.cpp | 6 +-
 icd/api/compiler_solution.cpp | 6 +-
 icd/api/compiler_solution_llpc.cpp | 110 +--
 icd/api/debug_printf.cpp | 156 ++--
 icd/api/entry.cpp | 1 +
 icd/api/gpumemory_event_handler.cpp | 736 ++++++++++++++----
 icd/api/graphics_pipeline_common.cpp | 15 +-
 icd/api/include/app_profile.h | 2 +
 icd/api/include/compiler_solution.h | 21 +-
 icd/api/include/compiler_solution_llpc.h | 16 +-
 icd/api/include/gpumemory_event_handler.h | 176 ++++-
 .../khronos/sdk-1.3/vulkan/vulkan_core.h | 51 +-
 icd/api/include/log.h | 7 +-
 icd/api/include/pipeline_compiler.h | 7 +-
 icd/api/include/vk_cmdbuffer.h | 11 +-
 icd/api/include/vk_conv.h | 83 +-
 icd/api/include/vk_deferred_operation.h | 2 +-
 icd/api/include/vk_defines.h | 2 +
 icd/api/include/vk_device.h | 9 +-
 icd/api/include/vk_extensions.h | 2 +
 .../include/vk_graphics_pipeline_library.h | 1 +
 icd/api/include/vk_physical_device.h | 17 -
 icd/api/include/vk_pipeline.h | 3 +-
 icd/api/include/vk_pipeline_cache.h | 19 +-
 icd/api/include/vk_shader.h | 2 +-
 icd/api/include/vk_utils.h | 3 +
icd/api/internal_mem_mgr.cpp | 45 +- icd/api/pipeline_compiler.cpp | 301 +++---- icd/api/raytrace/ray_tracing_device.cpp | 148 +++- icd/api/raytrace/ray_tracing_device.h | 39 + icd/api/raytrace/vk_ray_tracing_pipeline.cpp | 15 +- icd/api/raytrace/vk_ray_tracing_pipeline.h | 3 +- icd/api/strings/entry_points.txt | 1 + icd/api/strings/extensions.txt | 2 + icd/api/vk_cmdbuffer.cpp | 514 ++++++------ icd/api/vk_compute_pipeline.cpp | 8 +- icd/api/vk_conv.cpp | 18 + icd/api/vk_descriptor_pool.cpp | 5 +- icd/api/vk_descriptor_set.cpp | 27 +- icd/api/vk_device.cpp | 105 ++- icd/api/vk_dispatch.cpp | 1 + icd/api/vk_graphics_pipeline.cpp | 13 +- icd/api/vk_graphics_pipeline_library.cpp | 22 +- icd/api/vk_image.cpp | 3 + icd/api/vk_instance.cpp | 15 +- icd/api/vk_memory.cpp | 13 +- icd/api/vk_physical_device.cpp | 106 +-- icd/api/vk_pipeline.cpp | 9 +- icd/api/vk_pipeline_cache.cpp | 203 +---- icd/api/vk_pipeline_layout.cpp | 37 +- icd/api/vk_query.cpp | 59 +- icd/api/vk_queue.cpp | 2 - icd/api/vk_shader.cpp | 2 +- icd/api/vk_utils.cpp | 36 + .../shaders/bc3-encode-hlsl/bcn_common_api.h | 17 - .../include/vk_layer_switchable_graphics.h | 3 +- icd/res/ver.h | 8 +- icd/settings/settings.cpp | 36 +- icd/settings/settings_xgl.json | 89 ++- 64 files changed, 2178 insertions(+), 1233 deletions(-) rename icd/api/appopt/shader_profiles/llpc/gfxIp11_0/{Navi31 => generic}/Rage2/profile.json (100%) diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index f1a7ae0b..3248547c 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "792") +set(ICD_PAL_CLIENT_MAJOR_VERSION "796") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. @@ -37,7 +37,7 @@ set(ICD_GPUOPEN_CLIENT_MAJOR_VERSION "42") #if VKI_RAY_TRACING # This will become the value of GPURT_CLIENT_INTERFACE_MAJOR_VERSION if VKI_RAY_TRACING=1. # It describes the interface version of the GpuRT shared module that the ICD supports. -set(ICD_GPURT_CLIENT_MAJOR_VERSION "32") +set(ICD_GPURT_CLIENT_MAJOR_VERSION "33") #endif # This will become the value of LLPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_LLPC=1. 
diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json
index 56a0a745..b7b08e15 100644
--- a/icd/Loader/LunarG/Lnx/amd-icd.json
+++ b/icd/Loader/LunarG/Lnx/amd-icd.json
@@ -2,13 +2,13 @@
     "file_format_version": "1.0.0",
     "ICD": {
         "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so",
-        "api_version": "1.3.246"
+        "api_version": "1.3.250"
     },
     "layer": {
         "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@",
         "type": "GLOBAL",
         "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so",
-        "api_version": "1.3.246",
+        "api_version": "1.3.250",
         "implementation_version": "1",
         "description": "AMD switchable graphics layer",
         "functions": {
diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp
index 783b914d..25b90abf 100644
--- a/icd/api/app_profile.cpp
+++ b/icd/api/app_profile.cpp
@@ -207,6 +207,12 @@ constexpr AppProfilePatternEntry AppNameSeriousSam4Win =
     "serious sam 4 - 64bit"
 };
 
+constexpr AppProfilePatternEntry AppNameRomeRemasteredLinux =
+{
+    PatternAppNameLower,
+    "rome"
+};
+
 constexpr AppProfilePatternEntry AppEngineSedp =
 {
     PatternEngineNameLower,
@@ -375,6 +381,13 @@ constexpr AppProfilePatternEntry AppEngineSaschaWillemsExamples =
     "vulkanexample"
 };
 
+// Steam version of Shadow of the Tomb Raider
+constexpr AppProfilePatternEntry AppNameSOTTR =
+{
+    PatternAppNameLower,
+    "sottr.exe"
+};
+
 #if VKI_RAY_TRACING
 constexpr AppProfilePatternEntry AppEngineVKD3D =
 {
@@ -902,6 +915,15 @@ AppProfilePattern AppPatternTable[] =
         }
     },
 
+    {
+        AppProfile::RomeRemastered,
+        {
+            AppNameRomeRemasteredLinux,
+            AppEngineFeral3D,
+            PatternEnd
+        }
+    },
+
     {
         AppProfile::ThreeKingdoms,
         {
@@ -1161,6 +1183,15 @@ AppProfilePattern AppPatternTable[] =
         }
     },
 
+    {
+        AppProfile::SOTTR,
+        {
+            AppNameSOTTR,
+            AppEngineDXVK,
+            PatternEnd
+        }
+    },
+
 #if VKI_RAY_TRACING
     {
         AppProfile::ControlDX12,
diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp
index b989ed56..9ed4744c 100644
--- a/icd/api/app_shader_optimizer.cpp
+++ b/icd/api/app_shader_optimizer.cpp
@@ -167,7 +167,8 @@ void ShaderOptimizer::CalculateMatchingProfileEntriesHash(
             pHasher->Update(shaderAction.pipelineShader);
             pHasher->Update(shaderAction.shaderCreate);
 
-            if (shaderAction.shaderReplace.pCode != nullptr)
+            if (shaderAction.shaderCreate.apply.shaderReplaceEnabled &&
+                (shaderAction.shaderReplace.pCode != nullptr))
             {
                 pHasher->Update(
                     static_cast<const uint8_t*>(shaderAction.shaderReplace.pCode),
diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Rage2/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/Rage2/profile.json
similarity index 100%
rename from icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Rage2/profile.json
rename to icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/Rage2/profile.json
diff --git a/icd/api/color_space_helper.cpp b/icd/api/color_space_helper.cpp
index 9214cd76..93a36a44 100644
--- a/icd/api/color_space_helper.cpp
+++ b/icd/api/color_space_helper.cpp
@@ -51,12 +51,12 @@ struct LookupDefines
 
 const LookupDefines colorspaceLookup[] =
 {
-    { Pal::ScreenColorSpace::CsSrgb, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, FmtSupport::Fmt_All },
+    { Pal::ScreenColorSpace::CsSrgb, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, FmtSupport::Fmt_8bpc },
     { Pal::ScreenColorSpace::CsBt709, VK_COLOR_SPACE_BT709_NONLINEAR_EXT, FmtSupport::Fmt_All },
     { Pal::ScreenColorSpace::TfHlg, VK_COLOR_SPACE_HDR10_HLG_EXT, FmtSupport::Fmt_KnownHDR },
-    { Pal::ScreenColorSpace::TfPq2084, VK_COLOR_SPACE_HDR10_ST2084_EXT, FmtSupport::Fmt_KnownHDR },
+    { Pal::ScreenColorSpace::TfPq2084,
VK_COLOR_SPACE_HDR10_ST2084_EXT, FmtSupport::Fmt_10bpc }, { Pal::ScreenColorSpace::TfDolbyVision, VK_COLOR_SPACE_DOLBYVISION_EXT, FmtSupport::Fmt_8bpc_unorm }, - { Pal::ScreenColorSpace::CsBt2020, VK_COLOR_SPACE_BT2020_LINEAR_EXT, FmtSupport::Fmt_KnownHDR }, + { Pal::ScreenColorSpace::CsBt2020, VK_COLOR_SPACE_BT2020_LINEAR_EXT, FmtSupport::Fmt_10bpc }, { Pal::ScreenColorSpace::CsAdobe, VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT, FmtSupport::Fmt_All }, { Pal::ScreenColorSpace::CsDciP3, VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT, FmtSupport::Fmt_All }, { Pal::ScreenColorSpace::CsScrgb, VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT, FmtSupport::Fmt_16bpc_sfloat }, diff --git a/icd/api/compiler_solution.cpp b/icd/api/compiler_solution.cpp index d9e8414e..d41f9c29 100644 --- a/icd/api/compiler_solution.cpp +++ b/icd/api/compiler_solution.cpp @@ -58,9 +58,9 @@ CompilerSolution::~CompilerSolution() // ===================================================================================================================== // Initialize CompilerSolution class VkResult CompilerSolution::Initialize( - Vkgc::GfxIpVersion gfxIp, - Pal::GfxIpLevel gfxIpLevel, - Vkgc::ICache* pCache) + Vkgc::GfxIpVersion gfxIp, + Pal::GfxIpLevel gfxIpLevel, + PipelineBinaryCache* pCache) { m_gfxIp = gfxIp; m_gfxIpLevel = gfxIpLevel; diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 72d660a4..9076938b 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -59,18 +59,21 @@ CompilerSolutionLlpc::~CompilerSolutionLlpc() // ===================================================================================================================== // Initialize CompilerSolutionLlpc class VkResult CompilerSolutionLlpc::Initialize( - Vkgc::GfxIpVersion gfxIp, - Pal::GfxIpLevel gfxIpLevel, - Vkgc::ICache* pCache) + Vkgc::GfxIpVersion gfxIp, + Pal::GfxIpLevel gfxIpLevel, + PipelineBinaryCache* pCache) { const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); - Vkgc::ICache* pInternalCache = pCache; - if (settings.shaderCacheMode == ShaderCacheDisable) + Vkgc::ICache* pInternalCache = nullptr; + if (pCache != nullptr) { - pInternalCache = nullptr; + if (settings.shaderCacheMode != ShaderCacheDisable) + { + pInternalCache = pCache->GetCacheAdapter(); + } } - VkResult result = CompilerSolution::Initialize(gfxIp, gfxIpLevel, pInternalCache); + VkResult result = CompilerSolution::Initialize(gfxIp, gfxIpLevel, pCache); if (result == VK_SUCCESS) { @@ -91,32 +94,12 @@ void CompilerSolutionLlpc::Destroy() } } -// ===================================================================================================================== -// Get size of shader cache object -size_t CompilerSolutionLlpc::GetShaderCacheSize( - PipelineCompilerType cacheType) -{ - VK_NEVER_CALLED(); - return 0; -} - -// ===================================================================================================================== -// Creates shader cache object. -VkResult CompilerSolutionLlpc::CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - void* pShaderCacheMem, - uint32_t expectedEntries, - ShaderCache* pShaderCache) -{ - return VK_ERROR_INITIALIZATION_FAILED; -} - // ===================================================================================================================== // Builds shader module from SPIR-V binary code. 
VkResult CompilerSolutionLlpc::BuildShaderModule(
    const Device* pDevice,
    VkShaderModuleCreateFlags flags,
+   VkShaderModuleCreateFlags internalShaderFlags,
    size_t codeSize,
    const void* pCode,
    const bool adaptForFastLink,
@@ -142,7 +125,7 @@ VkResult CompilerSolutionLlpc::BuildShaderModule(
     pPipelineCompiler->ApplyPipelineOptions(pDevice, 0, &moduleInfo.options.pipelineOptions);
 
 #if VKI_RAY_TRACING
-    if ((flags & VK_SHADER_MODULE_RAY_TRACING_INTERNAL_SHADER_BIT) != 0)
+    if ((internalShaderFlags & VK_INTERNAL_SHADER_FLAGS_RAY_TRACING_INTERNAL_SHADER_BIT) != 0)
     {
 #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 55
         moduleInfo.options.pipelineOptions.internalRtShaders = true;
@@ -183,11 +166,6 @@ void CompilerSolutionLlpc::FreeShaderModule(ShaderModuleHandle* pShaderModule)
     auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
 
     pInstance->FreeMem(pShaderModule->pLlpcShaderModule);
-
-    if (pShaderModule->elfPackage.codeSize > 0)
-    {
-        pInstance->FreeMem(const_cast<void*>(pShaderModule->elfPackage.pCode));
-    }
 }
 
 // =====================================================================================================================
@@ -377,36 +355,72 @@ VkResult CompilerSolutionLlpc::CreateGraphicsPipelineBinary(
 // Build ElfPackage for a specific shader module based on pipeline information
 VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary(
     const Device* pDevice,
+    PipelineCache* pPipelineCache,
     const ShaderStage stage,
     GraphicsPipelineBinaryCreateInfo* pCreateInfo,
     void* pPipelineDumpHandle,
     ShaderModuleHandle* pShaderModule)
 {
     VkResult result = VK_SUCCESS;
+    Util::MetroHash::Hash cacheId = {};
 
-    // Build the LLPC pipeline
-    Llpc::GraphicsPipelineBuildOut pipelineOut = {};
-
-    Vkgc::UnlinkedShaderStage unlinkedStage = UnlinkedStageCount;
-
-    // Belong to vertexProcess stage before fragment
-    if (stage < ShaderStage::ShaderStageFragment)
+    bool hitCache = false;
+    if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCache() != nullptr))
     {
-        unlinkedStage = UnlinkedShaderStage::UnlinkedStageVertexProcess;
+        Vkgc::BinaryData elfPackage = {};
+        Util::MetroHash128 hasher;
+        hasher.Update(pCreateInfo->libraryHash[stage]);
+        hasher.Update(m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash());
+        hasher.Finalize(cacheId.bytes);
+        auto pAppCache = pPipelineCache->GetPipelineCache();
+        hitCache = (pAppCache->LoadPipelineBinary(&cacheId, &elfPackage.codeSize, &elfPackage.pCode)
+            == Util::Result::Success);
+        pShaderModule->elfPackage = elfPackage;
     }
-    else if (stage == ShaderStage::ShaderStageFragment)
+
+    if (hitCache == false)
     {
-        unlinkedStage = UnlinkedShaderStage::UnlinkedStageFragment;
-    }
+        // Build the LLPC pipeline
+        Llpc::GraphicsPipelineBuildOut pipelineOut = {};
+        Vkgc::UnlinkedShaderStage unlinkedStage = UnlinkedStageCount;
 
-    auto llpcResult = m_pLlpc->buildGraphicsShaderStage(
+        // Stages before the fragment stage belong to the vertex-processing part of the pipeline
+        if (stage < ShaderStage::ShaderStageFragment)
+        {
+            unlinkedStage = UnlinkedShaderStage::UnlinkedStageVertexProcess;
+        }
+        else if (stage == ShaderStage::ShaderStageFragment)
+        {
+            unlinkedStage = UnlinkedShaderStage::UnlinkedStageFragment;
+        }
+
+        auto llpcResult = m_pLlpc->buildGraphicsShaderStage(
             &pCreateInfo->pipelineInfo,
             &pipelineOut,
             unlinkedStage,
             pPipelineDumpHandle);
 
-    if (llpcResult == Vkgc::Result::Success)
+        if (llpcResult == Vkgc::Result::Success)
+        {
+            pShaderModule->elfPackage = pipelineOut.pipelineBin;
+            if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCache() != nullptr))
+            {
+                pPipelineCache->GetPipelineCache()->StorePipelineBinary(
+                    &cacheId, pipelineOut.pipelineBin.codeSize, pipelineOut.pipelineBin.pCode);
+            }
+        }
+        else
+        {
+
+            result = (llpcResult == Vkgc::Result::ErrorOutOfMemory) ?
+                VK_ERROR_OUT_OF_HOST_MEMORY : VK_ERROR_INITIALIZATION_FAILED;
+
+        }
+    }
+
+    if (result == VK_SUCCESS)
     {
-        pShaderModule->elfPackage = pipelineOut.pipelineBin;
+        pCreateInfo->earlyElfPackage[stage] = pShaderModule->elfPackage;
+        pCreateInfo->earlyElfPackageHash[stage] = cacheId;
     }
 
     return result;
diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp
index ef5f432c..13ce0b3e 100644
--- a/icd/api/debug_printf.cpp
+++ b/icd/api/debug_printf.cpp
@@ -114,23 +114,19 @@ void DebugPrintf::BindPipeline(
             srdInfo.range = m_printfMemory.Size();
             pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &srdInfo, pTable);
             m_frame = 0;
-        }
-    }
-
-    if (m_state == MemoryAllocated)
-    {
-        const Pal::uint32* pEntry = reinterpret_cast<const Pal::uint32*>(&tableVa);
-        pCmdBuffer->CmdSetUserData(static_cast<Pal::PipelineBindPoint>(bindPoint), userDataOffset, 1, pEntry);
+            const Pal::uint32* pEntry = reinterpret_cast<const Pal::uint32*>(&tableVa);
+            pCmdBuffer->CmdSetUserData(static_cast<Pal::PipelineBindPoint>(bindPoint), userDataOffset, 1, pEntry);
 
-        m_parsedFormatStrings.Reset();
-        for (auto it = pPipeline->GetFormatStrings().Begin(); it.Get() != nullptr; it.Next())
-        {
-            bool found = true;
-            PrintfSubSection* pSubSections = nullptr;
-            m_parsedFormatStrings.FindAllocate(it.Get()->key, &found, &pSubSections);
-            VK_ASSERT(found == false);
-            pSubSections->Reserve(1);
-            ParseFormatStringsToSubSection(it.Get()->value.printStr, pSubSections);
+            m_parsedFormatStrings.Reset();
+            for (auto it = pPipeline->GetFormatStrings().Begin(); it.Get() != nullptr; it.Next())
+            {
+                bool found = true;
+                PrintfSubSection* pSubSections = nullptr;
+                m_parsedFormatStrings.FindAllocate(it.Get()->key, &found, &pSubSections);
+                VK_ASSERT(found == false);
+                pSubSections->Reserve(1);
+                ParseFormatStringsToSubSection(it.Get()->value.printStr, pSubSections);
+            }
         }
     }
 }
@@ -169,7 +165,8 @@ Pal::Result DebugPrintf::PostQueueProcess(
     uint64_t bufferSize = 0;
     uint32_t* pPrintBuffer = nullptr;
     uint32_t* pPtr = nullptr;
-    uint64_t maxBufferDWSize = m_printfMemory.Size() >> 2;
+    constexpr uint32_t bufferHeaderSize = 4;
+    uint64_t maxBufferDWSize = (m_printfMemory.Size() >> 2) - bufferHeaderSize;
     if (palResult == Pal::Result::Success)
     {
         // Buffer Header is 4 dwords: {BufferOffset_Loword, BufferOffset_Hiword, reserved0, reserved1};
@@ -179,76 +176,83 @@ Pal::Result DebugPrintf::PostQueueProcess(
         pPtr += 2;
         bufferSize = (static_cast<uint64_t>(bufferSizeHigh) << 32) | static_cast<uint64_t>(bufferSizeLower);
         bufferSize = Util::Min(bufferSize, maxBufferDWSize);
+        if (bufferSize > 0)
+        {
+            pPrintBuffer = static_cast<uint32_t*>(pDevice->VkInstance()->AllocMem(
+                bufferSize * sizeof(uint32_t), 4, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND));
 
-        pPrintBuffer = static_cast<uint32_t*>(pDevice->VkInstance()->AllocMem(
-            bufferSize * sizeof(uint32_t), 4, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND));
-
-        memcpy(pPrintBuffer, pPtr, bufferSize * 4);
+            memcpy(pPrintBuffer, pPtr, bufferSize * 4);
+        }
 
         m_printfMemory.Unmap(deviceIdx);
 
-        const auto& formatStrings = m_pPipeline->GetFormatStrings();
-        const uint32_t entryHeaderSize = 2;
-        uint64_t decodeOffset = 0;
-        PrintfString outputBufferStr(nullptr);
-        outputBufferStr.Reserve(10);
-        Vector outputDecodedSpecifiers(nullptr);
-        outputDecodedSpecifiers.Reserve(5);
-        while (decodeOffset < bufferSize)
+        if (bufferSize > 0)
         {
-            // Decode entry
-            uint32_t entryHeaderLow = *pPtr++;
-            uint32_t entryHeaderHigh = *pPtr++;
-            uint64_t entryHeader =
((uint64_t)(entryHeaderHigh) << 32) | uint64_t(entryHeaderLow); - // 64 bit header {[0:15], [16:63]} entrySize,hash value for the string - uint64_t entrySize = (entryHeader & 65535); - uint64_t entryHashValue = entryHeader >> 16; - - decodeOffset += entryHeaderSize; - // Check hash value in the entry valid - auto pEntry = formatStrings.FindKey(entryHashValue); - if (pEntry == nullptr) + const auto& formatStrings = m_pPipeline->GetFormatStrings(); + const uint32_t entryHeaderSize = 2; + uint64_t decodeOffset = 0; + PrintfString outputBufferStr(nullptr); + outputBufferStr.Reserve(10); + Vector outputDecodedSpecifiers(nullptr); + outputDecodedSpecifiers.Reserve(5); + // Set pPtr point to the head of the system memory + pPtr = pPrintBuffer; + while ((bufferSize - decodeOffset) > 1) { - break; - } + // Decode entry + uint32_t entryHeaderLow = *pPtr++; + uint32_t entryHeaderHigh = *pPtr++; + uint64_t entryHeader = ((uint64_t)(entryHeaderHigh) << 32) | uint64_t(entryHeaderLow); + // 64 bit header {[0:15], [16:63]} entrySize,hash value for the string + uint64_t entryValuesSize = (entryHeader & 65535) - entryHeaderSize; + uint64_t entryHashValue = entryHeader >> 16; + + decodeOffset += entryHeaderSize; + // Check hash value in the entry valid and if there is space to decoded entry values + auto pEntry = formatStrings.FindKey(entryHashValue); + if ((pEntry == nullptr) || ((bufferSize - decodeOffset) < entryValuesSize)) + { + break; + } - const PrintfString& formatString = pEntry->printStr; - const PrintfBit& bitPos = pEntry->bit64s; - PrintfSubSection* pSubSections = m_parsedFormatStrings.FindKey(entryHashValue); - int initSize = bitPos.size() - outputDecodedSpecifiers.size(); - for (int i = 0; i < initSize; ++i) - { - outputDecodedSpecifiers.PushBack(nullptr); - } + const PrintfString& formatString = pEntry->printStr; + const PrintfBit& bitPos = pEntry->bit64s; + PrintfSubSection* pSubSections = m_parsedFormatStrings.FindKey(entryHashValue); + int initSize = bitPos.size() - outputDecodedSpecifiers.size(); + for (int i = 0; i < initSize; ++i) + { + outputDecodedSpecifiers.PushBack(nullptr); + } - // Get printf output variable in dword size - unsigned outputsInDwords = 0; - uint64_t outputVar; - for (uint32_t varIndex = 0; varIndex < bitPos.size(); varIndex++) - { - outputVar = *pPtr++; - outputsInDwords++; - bool is64bit = bitPos[varIndex]; - if (is64bit) + // Get printf output variable in dword size + unsigned outputsInDwords = 0; + uint64_t outputVar; + for (uint32_t varIndex = 0; varIndex < bitPos.size(); varIndex++) { - uint64_t hiDword = *pPtr++; - outputVar = (hiDword << 32) | outputVar; + outputVar = *pPtr++; outputsInDwords++; - } + bool is64bit = bitPos[varIndex]; + if (is64bit) + { + uint64_t hiDword = *pPtr++; + outputVar = (hiDword << 32) | outputVar; + outputsInDwords++; + } - DecodeSpecifier(formatString, - outputVar, - is64bit, - pSubSections, - varIndex, - &outputDecodedSpecifiers[varIndex]); + DecodeSpecifier(formatString, + outputVar, + is64bit, + pSubSections, + varIndex, + &outputDecodedSpecifiers[varIndex]); + } + OutputBufferString(formatString, *pSubSections, &outputBufferStr); + decodeOffset += outputsInDwords; } - OutputBufferString(formatString, *pSubSections, &outputBufferStr); - decodeOffset += outputsInDwords; + WriteToFile(outputBufferStr); + pDevice->VkInstance()->FreeMem(pPrintBuffer); + m_frame++; } - WriteToFile(outputBufferStr); - pDevice->VkInstance()->FreeMem(pPrintBuffer); - m_frame++; } return palResult; @@ -276,7 +280,7 @@ void DebugPrintf::WriteToFile( const 
char* fileBeginPostfix =" Begin ========================\n"; const char* fileEnd = "========================= Session End ========================\n"; file.Write(fileBeginPrefix, strlen(fileBeginPrefix)); - file.Write(fileName.Data(), fileName.NumElements()); + file.Write(fileName.Data(), strlen(fileName.Data())); file.Write(fileBeginPostfix, strlen(fileBeginPostfix)); result = file.Write(outputBuffer.Data(), outputBuffer.size()); if (result == Util::Result::Success) @@ -316,7 +320,7 @@ PrintfString DebugPrintf::GetFileName( AppendPrintfString(&fName, pDumpFolder, strlen(pDumpFolder)); AppendPrintfString(&fName, "/", 1); AppendPrintfString(&fName, fileName, strlen(fileName)); - AppendPrintfString(&fName, ".txt", 4); + AppendPrintfString(&fName, ".txt\0", 5); } return fName; } diff --git a/icd/api/entry.cpp b/icd/api/entry.cpp index 781cb33a..d59d93f7 100644 --- a/icd/api/entry.cpp +++ b/icd/api/entry.cpp @@ -102,6 +102,7 @@ VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer( ApiCmdBuffer::ObjectFromHandle(cmdBuffer)->BindIndexBuffer( buffer, offset, + VK_WHOLE_SIZE, indexType); } diff --git a/icd/api/gpumemory_event_handler.cpp b/icd/api/gpumemory_event_handler.cpp index 3a5c083b..d2d8a2c5 100644 --- a/icd/api/gpumemory_event_handler.cpp +++ b/icd/api/gpumemory_event_handler.cpp @@ -33,12 +33,18 @@ #include "include/vk_instance.h" #include "include/vk_device.h" +#include "palIntrusiveListImpl.h" #include "palHashMapImpl.h" +#include "palHashSetImpl.h" #include "palVectorImpl.h" namespace vk { +// Alloc, suballoc and bind entries have NullHandle assigned as object handle, +// if the correlation information was not provided yet to gpu memory event handler +constexpr uint64_t NullHandle = 0; + // ===================================================================================================================== GpuMemoryEventHandler::GpuMemoryEventHandler(Instance* pInstance) : @@ -47,12 +53,16 @@ GpuMemoryEventHandler::GpuMemoryEventHandler(Instance* pInstance) m_allocationHashMap(32, pInstance->Allocator()), m_vulkanSubAllocationHashMap(32, pInstance->Allocator()), m_palSubAllocationHashMap(32, pInstance->Allocator()), + m_bindHashMap(32, pInstance->Allocator()), + m_deviceHashSet(32, pInstance->Allocator()), m_memoryObjectId(0), - m_memoryEventEnables(0) + m_deviceCount(0) { m_allocationHashMap.Init(); m_vulkanSubAllocationHashMap.Init(); m_palSubAllocationHashMap.Init(); + m_deviceHashSet.Init(); + m_bindHashMap.Init(); } // ===================================================================================================================== @@ -86,6 +96,15 @@ void GpuMemoryEventHandler::Destroy() PAL_ALERT_MSG(m_vulkanSubAllocationHashMap.GetNumEntries() != 0, "Vulkan suballocations were not freed."); PAL_ALERT_MSG(m_palSubAllocationHashMap.GetNumEntries() != 0, "Pal suballocations were not freed."); + for (auto iter = m_bindHashMap.Begin(); iter.Get() != nullptr; iter.Next()) + { + const BindDataList& bindDataList = iter.Get()->value; + if (bindDataList.IsEmpty() == false) + { + PAL_ALERT_MSG(bindDataList.IsEmpty() == false, "Memory binds map is not empty."); + } + } + Util::Destructor(this); m_pInstance->FreeMem(this); @@ -113,7 +132,10 @@ void GpuMemoryEventHandler::PalDeveloperCallback( if (exists == false) { // Store the allocation information - pAllocationData->allocationData = *pGpuMemoryData; + pAllocationData->allocationData = *pGpuMemoryData; + pAllocationData->objectHandle = NullHandle; + pAllocationData->objectType = VK_OBJECT_TYPE_UNKNOWN; + 
pAllocationData->reportedToDeviceMemoryReport = false; // If this is a Pal internal allocation that is not suballocated report it to device_memory_report now if ((pAllocationData->allocationData.flags.isClient == 0) && // Pal internal, not Vulkan @@ -121,21 +143,18 @@ void GpuMemoryEventHandler::PalDeveloperCallback( (pAllocationData->allocationData.flags.buddyAllocated == 0) && // Buddy allocator is suballocated (pAllocationData->allocationData.flags.isExternal == 0)) // vkCreateMemory handles external { - m_callbacksLock.LockForRead(); - auto iter = m_callbacks.Begin(); - uint32_t heapIndex = 0; - - if (iter.IsValid()) - { - auto*const pPhysicalDevice = (iter.Get().pDevice)->VkPhysicalDevice(DefaultDeviceIndex); - bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, &heapIndex); - VK_ASSERT(validHeap); - } - m_callbacksLock.UnlockForRead(); - - // The instance is the default Vulkan object for allocations not specifically tracked otherwise. - pAllocationData->objectHandle = Instance::IntValueFromHandle(Instance::FromObject(m_pInstance)); - pAllocationData->objectType = VK_OBJECT_TYPE_INSTANCE; + // This is a Pal internal allocation that is not suballocated report it to device_memory_report now + Util::RWLockAuto deviceHashSetLock(&m_deviceHashSetLock); + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + uint32_t heapIndex = 0; + bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, + &heapIndex); + VK_ASSERT(validHeap); + // Physical device is the default Vulkan object for allocations not specifically tracked otherwise. + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + pAllocationData->objectHandle = ApiPhysicalDevice::IntValueFromHandle(pHandle); + pAllocationData->objectType = VK_OBJECT_TYPE_PHYSICAL_DEVICE; pAllocationData->reportedToDeviceMemoryReport = true; DeviceMemoryReportAllocateEvent( @@ -163,11 +182,10 @@ void GpuMemoryEventHandler::PalDeveloperCallback( if (pAllocationData != nullptr) { - // If this is a Pal internal free that is not suballocated report it to device_memory_report now - if ((pAllocationData->allocationData.flags.isClient == 0) && // Pal internal, not Vulkan - (pAllocationData->allocationData.flags.isCmdAllocator == 0) && // Command allocator is suballocated - (pAllocationData->allocationData.flags.buddyAllocated == 0) && // Buddy allocator is suballocated - (pAllocationData->allocationData.flags.isExternal == 0)) // vkCreateMemory handles external + // Report non-suballocated frees to device_memory_report and device_address_binding_report now, + // including Vulkan allocations + if ((pAllocationData->allocationData.flags.isCmdAllocator == 0) && // Command allocator is suballocated + (pAllocationData->allocationData.flags.buddyAllocated == 0)) // Buddy allocator is suballocated { if (pAllocationData->reportedToDeviceMemoryReport == true) { @@ -177,10 +195,13 @@ void GpuMemoryEventHandler::PalDeveloperCallback( pAllocationData->allocationData.pGpuMemory->Desc().uniqueId, pAllocationData->allocationData.flags.isExternal); } - else + else if ((pAllocationData->allocationData.flags.isClient != 1) && + (pAllocationData->allocationData.flags.isExternal != 1)) { PAL_ALERT_ALWAYS_MSG("Allocation freed that was never reported to device_memory_report"); } + + DeviceAddressBindingReportUnbindEvent(pAllocationData); } m_allocationHashMap.Erase(pGpuMemoryData->pGpuMemory); @@ -197,10 +218,10 @@ void 
GpuMemoryEventHandler::PalDeveloperCallback( Util::RWLockAuto lock(&m_palSubAllocationHashMapLock); auto*const pGpuMemoryData = reinterpret_cast(pCbData); - PAL_ASSERT_MSG((pGpuMemoryData->flags.isClient == 0) && // Pal internal allocation - (pGpuMemoryData->flags.isCmdAllocator == 0) && // Command allocator is suballocated Pal internal - (pGpuMemoryData->flags.buddyAllocated == 1) && // Buddy allocator is suballocated Pal internal - (pGpuMemoryData->flags.isExternal == 0) && // External memory is handled by vkCreateMemory + PAL_ASSERT_MSG((pGpuMemoryData->flags.isClient == 0) && // Pal internal allocation + ((pGpuMemoryData->flags.isCmdAllocator == 1) || // Command allocator, suballocated Pal internal + (pGpuMemoryData->flags.buddyAllocated == 1)) && // Buddy allocator is suballocated Pal internal + (pGpuMemoryData->flags.isExternal == 0) && // External memory is handled by vkCreateMemory (pGpuMemoryData->size < pGpuMemoryData->pGpuMemory->Desc().size), // Suballoc should be smaller "The base GPU allocation of this Pal internal suballocation is not as expected."); @@ -217,25 +238,41 @@ void GpuMemoryEventHandler::PalDeveloperCallback( // Add the new Pal suballocation if it did not exist already. if (exists == false) { - // Store the Pal suballocation information - pSubAllocData->allocationData = *pGpuMemoryData; - pSubAllocData->memoryObjectId = GenerateMemoryObjectId(); + Util::RWLockAuto deviceHashSetLock(&m_deviceHashSetLock); + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + uint32_t heapIndex = 0; + bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, + &heapIndex); + VK_ASSERT(validHeap); - m_callbacksLock.LockForRead(); - auto iter = m_callbacks.Begin(); - - if (iter.IsValid()) + // Store the Pal suballocation information + pSubAllocData->allocationData = *pGpuMemoryData; + pSubAllocData->memoryObjectId = GenerateMemoryObjectId(); + pSubAllocData->heapIndex = heapIndex; + pSubAllocData->offset = pGpuMemoryData->offset; + pSubAllocData->subAllocationSize = pGpuMemoryData->size; + pSubAllocData->objectHandle = NullHandle; + pSubAllocData->objectType = VK_OBJECT_TYPE_UNKNOWN; + pSubAllocData->reportedToDeviceMemoryReport = false; + + // Defer reporting of Pal buddy allocated suballocations to device_memory_report to + // ReportDeferredPalSubAlloc() but report CmdAllocator suballocations now + if (pGpuMemoryData->flags.isCmdAllocator) { - uint32_t heapIndex = 0; - auto*const pPhysicalDevice = (iter.Get().pDevice)->VkPhysicalDevice(DefaultDeviceIndex); - bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, &heapIndex); - VK_ASSERT(validHeap); - - pSubAllocData->heapIndex = heapIndex; + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + pSubAllocData->objectHandle = ApiPhysicalDevice::IntValueFromHandle(pHandle); + pSubAllocData->objectType = VK_OBJECT_TYPE_PHYSICAL_DEVICE; + pSubAllocData->reportedToDeviceMemoryReport = true; + + DeviceMemoryReportAllocateEvent( + pSubAllocData->objectHandle, + pSubAllocData->allocationData.size, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->heapIndex, + pSubAllocData->allocationData.flags.isExternal); } - m_callbacksLock.UnlockForRead(); - - // Defer reporting of Pal suballocations to device_memory_report to ReportDeferredPalSubAlloc() } else { @@ -265,11 +302,17 @@ void GpuMemoryEventHandler::PalDeveloperCallback( pSubAllocData->memoryObjectId, 
pSubAllocData->allocationData.flags.isExternal); } - else + else if (pSubAllocData->allocationData.flags.isCmdAllocator == 1) { - //PAL_ALERT_ALWAYS_MSG("SubFree of a Pal suballocation that was never reported to device_memory_report"); + PAL_ALERT_ALWAYS_MSG("SubFree: CmdAllocator suballoc was never reported to device_memory_report"); + } + else if (pSubAllocData->allocationData.flags.buddyAllocated == 1) + { + PAL_ALERT_ALWAYS_MSG("SubFree: Buddy Allocated suballoc was never reported to device_memory_report"); } + DeviceAddressBindingReportUnbindEvent(pSubAllocData); + m_palSubAllocationHashMap.Erase(key); } else @@ -281,6 +324,38 @@ void GpuMemoryEventHandler::PalDeveloperCallback( } case Pal::Developer::CallbackType::BindGpuMemory: { + auto* const pBindGpuMemoryData = reinterpret_cast(pCbData); + + if (pBindGpuMemoryData->isSystemMemory == false) + { + Util::MutexAuto lock(&m_bindHashMapMutex); + + bool exists = false; + BindDataList* pBindDataList = nullptr; + + Pal::Result palResult = m_bindHashMap.FindAllocate(pBindGpuMemoryData->pGpuMemory, &exists, &pBindDataList); + + if (palResult == Pal::Result::Success) + { + if (exists == false) + { + pBindDataList = VK_PLACEMENT_NEW(pBindDataList) BindDataList(); + } + + BindDataListNode* pBindDataListNode = nullptr; + BindDataListNode::Create(m_pInstance, pBindGpuMemoryData, &pBindDataListNode); + + if (pBindDataListNode != nullptr) + { + DeviceAddressBindingReportUnbindEvent(pBindDataListNode->GetData()); + + pBindDataList->PushFront(pBindDataListNode->GetNode()); + + DeviceAddressBindingReportBindEvent(pBindDataListNode->GetData()); + } + } + } + break; } default: @@ -289,18 +364,30 @@ void GpuMemoryEventHandler::PalDeveloperCallback( } // ===================================================================================================================== -// GpuMemoryEventHandler is requested by VK_EXT_device_memory_report and/or VK_EXT_device_address_binding_report. -// Increment the reference count of requests for GPU memory events. -void GpuMemoryEventHandler::EnableGpuMemoryEvents() +// GpuMemoryEventHandler events are required for VK_EXT_device_memory_report and VK_EXT_device_address_binding_report. +// Increment the count of devices when one or more of these extensions enabled. +void GpuMemoryEventHandler::EnableGpuMemoryEvents( + const Device* pDevice) { - Util::AtomicIncrement(&m_memoryEventEnables); + Util::RWLockAuto lock(&m_deviceHashSetLock); + VK_ASSERT(m_deviceHashSet.Contains(pDevice) == false); + + Util::AtomicIncrement(&m_deviceCount); + + m_deviceHashSet.Insert(pDevice); } // ===================================================================================================================== -// Decrement the reference count of requests for GPU memory events. -void GpuMemoryEventHandler::DisableGpuMemoryEvents() +// Decrement the count of devices to remove a device with one or more extensions enabled requiring GPU memory events. 
+void GpuMemoryEventHandler::DisableGpuMemoryEvents( + const Device* pDevice) { - Util::AtomicDecrement(&m_memoryEventEnables); + Util::RWLockAuto lock(&m_deviceHashSetLock); + VK_ASSERT(m_deviceHashSet.Contains(pDevice)); + + Util::AtomicDecrement(&m_deviceCount); + + m_deviceHashSet.Erase(pDevice); } // ===================================================================================================================== @@ -328,36 +415,46 @@ void GpuMemoryEventHandler::UnregisterDeviceMemoryReportCallbacks( // ===================================================================================================================== void GpuMemoryEventHandler::VulkanAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, uint64_t objectHandle, VkObjectType objectType, uint64_t heapIndex) { - Util::RWLockAuto lock(&m_allocationHashMapLock); + Util::RWLockAuto lock(&m_allocationHashMapLock); AllocationData* pAllocationData = m_allocationHashMap.FindKey(pGpuMemory); if (pAllocationData != nullptr) { - if (pAllocationData->reportedToDeviceMemoryReport == false) + pAllocationData->objectType = objectType; + pAllocationData->objectHandle = objectHandle; + pAllocationData->allocationData.pGpuMemory = pGpuMemory; + + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - pAllocationData->reportedToDeviceMemoryReport = true; - pAllocationData->objectType = objectType; - pAllocationData->objectHandle = objectHandle; - pAllocationData->allocationData.pGpuMemory = pGpuMemory; - - const auto& gpuMemoryDesc = pAllocationData->allocationData.pGpuMemory->Desc(); - - DeviceMemoryReportAllocateEvent( - pAllocationData->objectHandle, - gpuMemoryDesc.size, - pAllocationData->objectType, - gpuMemoryDesc.uniqueId, - heapIndex, - gpuMemoryDesc.flags.isExternal); + if (pAllocationData->reportedToDeviceMemoryReport == false) + { + pAllocationData->reportedToDeviceMemoryReport = true; + + const auto& gpuMemoryDesc = pAllocationData->allocationData.pGpuMemory->Desc(); + + DeviceMemoryReportAllocateEvent( + pAllocationData->objectHandle, + gpuMemoryDesc.size, + pAllocationData->objectType, + gpuMemoryDesc.uniqueId, + heapIndex, + gpuMemoryDesc.flags.isExternal); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the allocation of an already reported allocation."); + } } - else + + if (pDevice->GetEnabledFeatures().deviceAddressBindingReport) { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the allocation of an already reported allocation."); + DeviceAddressBindingReportBindEvent(pAllocationData); } } else @@ -368,45 +465,20 @@ void GpuMemoryEventHandler::VulkanAllocateEvent( // ===================================================================================================================== void GpuMemoryEventHandler::VulkanAllocationFailedEvent( + const Device* pDevice, Pal::gpusize allocatedSize, VkObjectType objectType, uint64_t heapIndex) { - DeviceMemoryReportAllocationFailedEvent(allocatedSize, objectType, heapIndex); -} - -// ===================================================================================================================== -void GpuMemoryEventHandler::VulkanFreeEvent( - const Pal::IGpuMemory* pGpuMemory) -{ - Util::RWLockAuto lock(&m_allocationHashMapLock); - AllocationData* pAllocationData = m_allocationHashMap.FindKey(pGpuMemory); - - if (pAllocationData != nullptr) + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - if (pAllocationData->reportedToDeviceMemoryReport == true) - { - const auto& gpuMemoryDesc = 
pAllocationData->allocationData.pGpuMemory->Desc(); - - DeviceMemoryReportFreeEvent( - pAllocationData->objectHandle, - pAllocationData->objectType, - gpuMemoryDesc.uniqueId, - gpuMemoryDesc.flags.isExternal); - } - else - { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an unreported allocation."); - } - } - else - { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an untracked allocation."); + DeviceMemoryReportAllocationFailedEvent(allocatedSize, objectType, heapIndex); } } // ===================================================================================================================== void GpuMemoryEventHandler::VulkanSubAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset, Pal::gpusize subAllocationSize, @@ -427,7 +499,6 @@ void GpuMemoryEventHandler::VulkanSubAllocateEvent( { if (exists == false) { - pSubAllocData->reportedToDeviceMemoryReport = true; pSubAllocData->allocationData.pGpuMemory = pGpuMemory; pSubAllocData->memoryObjectId = GenerateMemoryObjectId(); pSubAllocData->objectType = objectType; @@ -436,13 +507,23 @@ void GpuMemoryEventHandler::VulkanSubAllocateEvent( pSubAllocData->objectHandle = objectHandle; pSubAllocData->heapIndex = heapIndex; - DeviceMemoryReportAllocateEvent( - pSubAllocData->objectHandle, - pSubAllocData->subAllocationSize, - pSubAllocData->objectType, - pSubAllocData->memoryObjectId, - pSubAllocData->heapIndex, - pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + if (pDevice->GetEnabledFeatures().deviceMemoryReport) + { + pSubAllocData->reportedToDeviceMemoryReport = true; + + DeviceMemoryReportAllocateEvent( + pSubAllocData->objectHandle, + pSubAllocData->subAllocationSize, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->heapIndex, + pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + } + + if (pDevice->GetEnabledFeatures().deviceAddressBindingReport) + { + DeviceAddressBindingReportBindEvent(pSubAllocData); + } } else { @@ -453,6 +534,7 @@ void GpuMemoryEventHandler::VulkanSubAllocateEvent( // ===================================================================================================================== void GpuMemoryEventHandler::VulkanSubFreeEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset) { @@ -464,17 +546,25 @@ void GpuMemoryEventHandler::VulkanSubFreeEvent( if (pSubAllocData != nullptr) { - if (pSubAllocData->reportedToDeviceMemoryReport == true) + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - DeviceMemoryReportFreeEvent( - pSubAllocData->objectHandle, - pSubAllocData->objectType, - pSubAllocData->memoryObjectId, - pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + if (pSubAllocData->reportedToDeviceMemoryReport) + { + DeviceMemoryReportFreeEvent( + pSubAllocData->objectHandle, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an unreported Vulkan suballocation."); + } } - else + + if (pDevice->GetEnabledFeatures().deviceAddressBindingReport) { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an unreported Vulkan suballocation."); + DeviceAddressBindingReportUnbindEvent(pSubAllocData); } m_vulkanSubAllocationHashMap.Erase(key); @@ -575,12 +665,13 @@ void GpuMemoryEventHandler::DeviceMemoryReportFreeEvent( // 
===================================================================================================================== void GpuMemoryEventHandler::ReportDeferredPalSubAlloc( + const Device* pDevice, Pal::gpusize gpuVirtAddr, Pal::gpusize offset, const uint64_t objectHandle, const VkObjectType objectType) { - Util::RWLockAuto lock(&m_palSubAllocationHashMapLock); + Util::RWLockAuto lock(&m_palSubAllocationHashMapLock); SubAllocationKey key = {gpuVirtAddr, offset}; @@ -589,24 +680,28 @@ void GpuMemoryEventHandler::ReportDeferredPalSubAlloc( if (pSubAllocData != nullptr) { - if (pSubAllocData->reportedToDeviceMemoryReport == false) - { - // Report deferred Pal suballocation to device_memory_report now - pSubAllocData->objectHandle = objectHandle; - pSubAllocData->objectType = objectType; - pSubAllocData->reportedToDeviceMemoryReport = true; - - DeviceMemoryReportAllocateEvent( - pSubAllocData->objectHandle, - pSubAllocData->allocationData.size, - pSubAllocData->objectType, - pSubAllocData->memoryObjectId, - pSubAllocData->heapIndex, - pSubAllocData->allocationData.flags.isExternal); - } - else + pSubAllocData->objectHandle = objectHandle; + pSubAllocData->objectType = objectType; + + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the allocation of an already reported Pal suballocation."); + if (pSubAllocData->reportedToDeviceMemoryReport == false) + { + // Report deferred Pal suballocation to device_memory_report now + pSubAllocData->reportedToDeviceMemoryReport = true; + + DeviceMemoryReportAllocateEvent( + pSubAllocData->objectHandle, + pSubAllocData->allocationData.size, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->heapIndex, + pSubAllocData->allocationData.flags.isExternal); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report allocation of an already reported Pal suballoc."); + } } } else @@ -619,7 +714,7 @@ void GpuMemoryEventHandler::ReportDeferredPalSubAlloc( void GpuMemoryEventHandler::SendDeviceMemoryReportEvent( const VkDeviceMemoryReportCallbackDataEXT& callbackData) { - Util::RWLockAuto lock(&m_callbacksLock); + Util::RWLockAuto lock(&m_callbacksLock); for (auto iter = m_callbacks.Begin(); iter.IsValid(); iter.Next()) { @@ -627,4 +722,377 @@ void GpuMemoryEventHandler::SendDeviceMemoryReportEvent( } } +// ===================================================================================================================== +// Creates the BindDataListNode class. 
+void GpuMemoryEventHandler::BindDataListNode::Create( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData, + GpuMemoryEventHandler::BindDataListNode** ppObject) +{ + void* pSystemMem = pInstance->AllocMem( + sizeof(GpuMemoryEventHandler::BindDataListNode), + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pSystemMem != nullptr) + { + BindDataListNode* pBindDataListNode = + VK_PLACEMENT_NEW(pSystemMem) BindDataListNode(pInstance, pBindGpuMemoryData); + + *ppObject = pBindDataListNode; + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::BindDataListNode::Destroy() +{ + Util::Destructor(this); + + m_pInstance->FreeMem(this); +} + +// ===================================================================================================================== +GpuMemoryEventHandler::BindDataListNode::BindDataListNode( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData) + : m_pInstance(pInstance) + , m_node(this) +{ + m_data.bindGpuMemoryData = *pBindGpuMemoryData; + m_data.objectHandle = NullHandle; + m_data.objectType = VK_OBJECT_TYPE_UNKNOWN; + m_data.reportedToDeviceAddressBindingReport = false; +} + +// ===================================================================================================================== +bool GpuMemoryEventHandler::CheckIntervalsIntersect( + const Interval& intervalOne, + const Interval& intervalTwo + ) const +{ + bool intersect = false; + + if (intervalOne.m_offset < intervalTwo.m_offset) + { + intersect = intervalTwo.m_offset <= (intervalOne.m_offset + intervalOne.m_size); + } + else + { + intersect = intervalOne.m_offset <= (intervalTwo.m_offset + intervalTwo.m_size); + } + + return intersect; +} + +// ===================================================================================================================== +// The caller of this function must hold the m_bindHashMapMutex mutex +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + const Pal::IGpuMemory* pGpuMemory, + const Interval& interval) +{ + BindDataList* pBindDataList = m_bindHashMap.FindKey(pGpuMemory); + + if (pBindDataList != nullptr) + { + auto iter = pBindDataList->Begin(); + while(iter.IsValid()) + { + BindData* pBindData = iter.Get()->GetData(); + bool intersect = true; + + if (interval.m_size > 0) + { + intersect = CheckIntervalsIntersect( + Interval(pBindData->bindGpuMemoryData.offset, pBindData->bindGpuMemoryData.requiredGpuMemSize), + interval); + } + + if (intersect) + { + BindDataListNode* pBindDataListNode = iter.Get(); + if (pBindData->reportedToDeviceAddressBindingReport) + { + ReportUnbindEvent(pBindData); + } + else + { + PAL_ALERT_ALWAYS_MSG("Trying to report unbind, but bind was not reported previously."); + } + + pBindDataList->Erase(&iter); + pBindDataListNode->Destroy(); + } + else + { + iter.Next(); + } + } + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportBindEvent( + const AllocationData* pAllocationData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + BindDataList* pBindDataList = m_bindHashMap.FindKey(pAllocationData->allocationData.pGpuMemory); + + if (pBindDataList != nullptr) + { + for (auto iter = pBindDataList->Begin(); iter.IsValid(); iter.Next()) + { + ReportBindEvent(iter.Get()->GetData(), pAllocationData->objectHandle, pAllocationData->objectType); + } 
+ } +} +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + const AllocationData* pAllocationData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + DeviceAddressBindingReportUnbindEvent(pAllocationData->allocationData.pGpuMemory, Interval()); +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportBindEvent( + const SubAllocationData* pSubAllocData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + BindDataList* pBindDataList = m_bindHashMap.FindKey(pSubAllocData->allocationData.pGpuMemory); + + if (pBindDataList != nullptr) + { + for (auto iter = pBindDataList->Begin(); iter.IsValid(); iter.Next()) + { + BindData* pBindData = iter.Get()->GetData(); + + bool intersect = CheckIntervalsIntersect( + Interval(pSubAllocData->offset, pSubAllocData->subAllocationSize), + Interval(pBindData->bindGpuMemoryData.offset, pBindData->bindGpuMemoryData.requiredGpuMemSize)); + + if (intersect) + { + ReportBindEvent(pBindData, pSubAllocData->objectHandle, pSubAllocData->objectType); + } + } + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + const SubAllocationData* pSubAllocData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + DeviceAddressBindingReportUnbindEvent( + pSubAllocData->allocationData.pGpuMemory, + Interval(pSubAllocData->offset, pSubAllocData->subAllocationSize)); +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportBindEvent( + BindData* pNewBindData) +{ + // The caller of this function must hold the m_bindHashMapMutex mutex + m_allocationHashMapLock.LockForRead(); + + const AllocationData* pAllocationData = m_allocationHashMap.FindKey(pNewBindData->bindGpuMemoryData.pGpuMemory); + + if (pAllocationData != nullptr) + { + if (pAllocationData->objectHandle != NullHandle) + { + ReportBindEvent(pNewBindData, pAllocationData->objectHandle, pAllocationData->objectType); + } + else if ((pAllocationData->allocationData.flags.isClient == 0) && + (pAllocationData->allocationData.flags.buddyAllocated == 0) && + (pAllocationData->allocationData.flags.isCmdAllocator == 0) && + (pAllocationData->allocationData.flags.isExternal == 0)) + { + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + + // Pal internal allocation; attribute this to the physical device + ReportBindEvent( + pNewBindData, + ApiPhysicalDevice::IntValueFromHandle(pHandle), + VK_OBJECT_TYPE_PHYSICAL_DEVICE); + } + } + m_allocationHashMapLock.UnlockForRead(); + + if (pNewBindData->reportedToDeviceAddressBindingReport == false) + { + m_palSubAllocationHashMapLock.LockForRead(); + for (auto iter = m_palSubAllocationHashMap.Begin(); iter.Get() != nullptr; iter.Next()) + { + const SubAllocationData* pSubAllocData = &iter.Get()->value; + if (pSubAllocData->allocationData.pGpuMemory != pNewBindData->bindGpuMemoryData.pGpuMemory) + { + continue; + } + + bool intersect = CheckIntervalsIntersect( + Interval(pSubAllocData->offset, 
pSubAllocData->subAllocationSize), + Interval(pNewBindData->bindGpuMemoryData.offset, pNewBindData->bindGpuMemoryData.requiredGpuMemSize)); + + if (intersect) + { + if (pSubAllocData->objectHandle != NullHandle) + { + ReportBindEvent(pNewBindData, pSubAllocData->objectHandle, pSubAllocData->objectType); + } + else if ((pSubAllocData->allocationData.flags.isClient == 0) && + ((pSubAllocData->allocationData.flags.buddyAllocated == 1) || + (pSubAllocData->allocationData.flags.isCmdAllocator == 1)) && + (pSubAllocData->allocationData.flags.isExternal == 0)) + { + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + + // Pal internal allocation; attribute this to the physical device + ReportBindEvent( + pNewBindData, + ApiPhysicalDevice::IntValueFromHandle(pHandle), + VK_OBJECT_TYPE_PHYSICAL_DEVICE); + } + break; + } + } + m_palSubAllocationHashMapLock.UnlockForRead(); + } + + if (pNewBindData->reportedToDeviceAddressBindingReport == false) + { + m_vulkanSubAllocationHashMapLock.LockForRead(); + for (auto iter = m_vulkanSubAllocationHashMap.Begin(); iter.Get() != nullptr; iter.Next()) + { + const SubAllocationData* pSubAllocData = &iter.Get()->value; + if (pSubAllocData->allocationData.pGpuMemory != pNewBindData->bindGpuMemoryData.pGpuMemory) + { + continue; + } + + if (pSubAllocData->objectHandle == NullHandle) + { + continue; + } + + bool intersect = CheckIntervalsIntersect( + Interval(pSubAllocData->offset, pSubAllocData->subAllocationSize), + Interval(pNewBindData->bindGpuMemoryData.offset, pNewBindData->bindGpuMemoryData.requiredGpuMemSize)); + + if (intersect) + { + ReportBindEvent(pNewBindData, pSubAllocData->objectHandle, pSubAllocData->objectType); + break; + } + } + m_vulkanSubAllocationHashMapLock.UnlockForRead(); + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + BindData* pNewBindData) +{ + // The caller of this function must hold the m_bindHashMapMutex mutex + DeviceAddressBindingReportUnbindEvent( + pNewBindData->bindGpuMemoryData.pGpuMemory, + Interval(pNewBindData->bindGpuMemoryData.offset, pNewBindData->bindGpuMemoryData.requiredGpuMemSize)); +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::ReportBindEvent( + BindData* pBindData, + uint64_t objectHandle, + VkObjectType objectType) +{ + if (pBindData->reportedToDeviceAddressBindingReport == false) + { + pBindData->objectHandle = objectHandle; + pBindData->objectType = objectType; + pBindData->reportedToDeviceAddressBindingReport = true; + + DeviceAddressBindingReportCallback( + pBindData->objectHandle, + pBindData->objectType, + VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT, + pBindData->bindGpuMemoryData.pGpuMemory->Desc().gpuVirtAddr + pBindData->bindGpuMemoryData.offset, + pBindData->bindGpuMemoryData.requiredGpuMemSize, + pBindData->objectHandle == NullHandle); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report an already reported bind"); + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::ReportUnbindEvent( + BindData* pBindData) +{ + if (pBindData->reportedToDeviceAddressBindingReport) + { + 
DeviceAddressBindingReportCallback( + pBindData->objectHandle, + pBindData->objectType, + VK_DEVICE_ADDRESS_BINDING_TYPE_UNBIND_EXT, + pBindData->bindGpuMemoryData.pGpuMemory->Desc().gpuVirtAddr + pBindData->bindGpuMemoryData.offset, + pBindData->bindGpuMemoryData.requiredGpuMemSize, + pBindData->objectHandle == NullHandle); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report unbind of an unreported Vulkan bind."); + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportCallback( + uint64_t objectHandle, + VkObjectType objectType, + VkDeviceAddressBindingTypeEXT bindingType, + VkDeviceAddress bindingAddress, + VkDeviceSize allocatedSize, + bool isInternal) +{ + VkDeviceAddressBindingCallbackDataEXT bindingCallbackData = {}; + + bindingCallbackData.sType = VK_STRUCTURE_TYPE_DEVICE_ADDRESS_BINDING_CALLBACK_DATA_EXT; + bindingCallbackData.pNext = nullptr; + bindingCallbackData.flags = isInternal ? VK_DEVICE_ADDRESS_BINDING_INTERNAL_OBJECT_BIT_EXT : 0; + bindingCallbackData.baseAddress = bindingAddress; + bindingCallbackData.size = allocatedSize; + bindingCallbackData.bindingType = bindingType; + + VkDebugUtilsObjectNameInfoEXT objectNameInfo = {}; + + objectNameInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT; + objectNameInfo.pNext = nullptr; + objectNameInfo.objectType = objectType; + objectNameInfo.objectHandle = objectHandle; + objectNameInfo.pObjectName = nullptr; + + VkDebugUtilsMessengerCallbackDataEXT callbackData = {}; + + callbackData.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CALLBACK_DATA_EXT; + callbackData.pNext = &bindingCallbackData; + callbackData.flags = 0; // reserved for future use + callbackData.pMessageIdName = nullptr; + callbackData.messageIdNumber = 0; + callbackData.pMessage = nullptr; + callbackData.queueLabelCount = 0; + callbackData.pQueueLabels = nullptr; + callbackData.cmdBufLabelCount = 0; + callbackData.pCmdBufLabels = nullptr; + callbackData.objectCount = 1; + callbackData.pObjects = &objectNameInfo; + + m_pInstance->CallExternalMessengers( + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT, + VK_DEBUG_UTILS_MESSAGE_TYPE_DEVICE_ADDRESS_BINDING_BIT_EXT, + &callbackData); +} + } // namespace vk diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index d3ed09d0..a477e89c 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -293,8 +293,6 @@ void GraphicsPipelineCommon::GetSubpassSampleCount( // subpassCoverageSampleCount would be equal to zero if there are zero attachments. coverageSampleCount = (coverageSampleCount == 0) ? rasterizationSampleCount : coverageSampleCount; - VK_ASSERT(rasterizationSampleCount == coverageSampleCount); - if (pCoverageSampleCount != nullptr) { *pCoverageSampleCount = coverageSampleCount; @@ -1012,9 +1010,6 @@ static void BuildRasterizationState( pInfo->staticStateMask |= 1ULL << static_cast(DynamicStatesInternal::DepthBias); } - // point size must be set via gl_PointSize, otherwise it must be 1.0f. 
- constexpr float DefaultPointSize = 1.0f; - pInfo->immedInfo.pointLineRasterParams.lineWidth = pRs->lineWidth; pInfo->immedInfo.pointLineRasterParams.pointSize = DefaultPointSize; pInfo->immedInfo.pointLineRasterParams.pointSizeMin = limits.pointSizeRange[0]; @@ -1397,12 +1392,13 @@ static void BuildMultisampleState( pInfo->immedInfo.msaaCreateInfo.flags.enable1xMsaaSampleLocations = (pInfo->immedInfo.msaaCreateInfo.coverageSamples == 1); - if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::SampleLocations) == false) + if ((IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::SampleLocations) == false) && + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::RasterizationSamples) == false)) { if (pPipelineSampleLocationsStateCreateInfoEXT != nullptr) { // We store the custom sample locations if custom sample locations are enabled and the - // sample locations state is static. + // sample locations state is static and rasterizationSamples is not configured dynamically. pInfo->immedInfo.samplePattern.sampleCount = (uint32_t)pPipelineSampleLocationsStateCreateInfoEXT->sampleLocationsInfo.sampleLocationsPerPixel; @@ -1424,9 +1420,10 @@ static void BuildMultisampleState( (1ULL << static_cast<uint32_t>(DynamicStatesInternal::SampleLocations)); } } - else + else if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::RasterizationSamples) == false) { - // We store the standard sample locations if custom sample locations are not enabled. + // We store the standard sample locations if custom sample locations are not enabled and + // rasterizationSamples is not configured dynamically. pInfo->immedInfo.samplePattern.sampleCount = pMs->rasterizationSamples; pInfo->immedInfo.samplePattern.locations = *Device::GetDefaultQuadSamplePattern(pMs->rasterizationSamples);
diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index 213d11c5..764fb497 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -69,6 +69,7 @@ enum class AppProfile : uint32_t DawnOfWarIII, // Dawn of War III by Feral3D WarHammerII, // Total War: WarHammer II by Feral3D WarHammerIII, // Total War: WarHammer III by Feral3D + RomeRemastered, // Total War: Rome Remastered AshesOfTheSingularity, // Ashes Of The Singularity StrangeBrigade, // Strange Brigade WorldWarZ, // WorldWarZ @@ -129,6 +130,7 @@ enum class AppProfile : uint32_t Satisfactory, // Satisfactory by Coffee Stain Studios QuakeEnhanced, // Quake Enhanced by id Software Zink, // Zink + SOTTR, // Shadow of the Tomb Raider (Steam version) }; struct ProfileSettings
diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index 159ad80d..40412dee 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -46,6 +46,7 @@ namespace vk class PhysicalDevice; class PipelineCache; +class PipelineBinaryCache; class ShaderCache; class DeferredHostOperation; @@ -64,8 +65,8 @@ enum FreeCompilerBinary : uint32_t struct ShaderModuleHandle { uint32_t* pRefCount; - void* pLlpcShaderModule; // Shader module handle from LLPC Vkgc::BinaryData elfPackage; // Generated ElfPackage from LLPC + void* pLlpcShaderModule; // Shader module handle from LLPC }; // ===================================================================================================================== @@ -143,6 +144,9 @@ struct GraphicsPipelineBinaryCreateInfo PipelineCompilerType compilerType; bool linkTimeOptimization; Vkgc::BinaryData
earlyElfPackage[ShaderStage::ShaderStageGfxCount]; + Util::MetroHash::Hash earlyElfPackageHash[ShaderStage::ShaderStageGfxCount]; + uint64_t apiPsoHash; + uint64_t libraryHash[ShaderStage::ShaderStageGfxCount]; FreeCompilerBinary freeCompilerBinary; PipelineCreationFeedback pipelineFeedback; PipelineCreationFeedback stageFeedback[ShaderStage::ShaderStageGfxCount]; @@ -167,6 +171,7 @@ struct ComputePipelineBinaryCreateInfo PipelineCreationFeedback pipelineFeedback; PipelineCreationFeedback stageFeedback; PipelineMetadata* pBinaryMetadata; + uint64_t apiPsoHash; }; #if VKI_RAY_TRACING @@ -186,6 +191,7 @@ struct RayTracingPipelineBinaryCreateInfo uint32_t maxAttributeSize; bool allowShaderInlining; DeferredWorkload* pDeferredWorkload; + uint64_t apiPsoHash; }; // ===================================================================================================================== @@ -209,22 +215,14 @@ class CompilerSolution CompilerSolution(PhysicalDevice* pPhysicalDevice); virtual ~CompilerSolution(); - virtual VkResult Initialize(Vkgc::GfxIpVersion gfxIp, Pal::GfxIpLevel gfxIpLevel, Vkgc::ICache* pCache) = 0; + virtual VkResult Initialize(Vkgc::GfxIpVersion gfxIp, Pal::GfxIpLevel gfxIpLevel, PipelineBinaryCache* pCache); virtual void Destroy() = 0; - virtual size_t GetShaderCacheSize(PipelineCompilerType cacheType) = 0; - - virtual VkResult CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - void* pShaderCacheMem, - uint32_t expectedEntries, - ShaderCache* pShaderCache) = 0; - virtual VkResult BuildShaderModule( const Device* pDevice, VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -253,6 +251,7 @@ class CompilerSolution virtual VkResult CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, void* pPipelineDumpHandle, diff --git a/icd/api/include/compiler_solution_llpc.h b/icd/api/include/compiler_solution_llpc.h index 0058968d..189d7126 100644 --- a/icd/api/include/compiler_solution_llpc.h +++ b/icd/api/include/compiler_solution_llpc.h @@ -77,22 +77,17 @@ class CompilerSolutionLlpc final : public CompilerSolution public: // Overridden functions - virtual VkResult Initialize(Vkgc::GfxIpVersion gfxIp, Pal::GfxIpLevel gfxIpLevel, Vkgc::ICache* pCache) override; + virtual VkResult Initialize( + Vkgc::GfxIpVersion gfxIp, + Pal::GfxIpLevel gfxIpLevel, + PipelineBinaryCache* pCache) override; virtual void Destroy() override; - virtual size_t GetShaderCacheSize(PipelineCompilerType cacheType) override; - - virtual VkResult CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - void* pShaderCacheMem, - uint32_t expectedEntries, - ShaderCache* pShaderCache) override; - virtual VkResult BuildShaderModule( const Device* pDevice, VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -121,6 +116,7 @@ class CompilerSolutionLlpc final : public CompilerSolution virtual VkResult CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, void* pPipelineDumpHandle, diff --git a/icd/api/include/gpumemory_event_handler.h b/icd/api/include/gpumemory_event_handler.h index 26606875..a1a40588 100644 --- a/icd/api/include/gpumemory_event_handler.h +++ 
b/icd/api/include/gpumemory_event_handler.h @@ -32,7 +32,9 @@ #include "include/vk_alloccb.h" #include "include/vk_utils.h" +#include "palIntrusiveList.h" #include "palHashMap.h" +#include "palHashSet.h" #include "palMutex.h" #include "palUtil.h" #include "palVector.h" @@ -61,18 +63,20 @@ class GpuMemoryEventHandler Pal::Developer::CallbackType type, void* pCbData); - void EnableGpuMemoryEvents(); + void EnableGpuMemoryEvents( + const Device* pDevice); - void DisableGpuMemoryEvents(); + void DisableGpuMemoryEvents( + const Device* pDevice); - VK_FORCEINLINE bool IsGpuMemoryEventHandlerEnabled() { return m_memoryEventEnables > 0; } + VK_FORCEINLINE bool IsGpuMemoryEventHandlerEnabled() { return m_deviceCount > 0; } - typedef struct + struct DeviceMemoryReportCallback { PFN_vkDeviceMemoryReportCallbackEXT callback; void* pData; const Device* pDevice; - } DeviceMemoryReportCallback; + }; typedef Util::Vector DeviceMemoryReportCallbacks; @@ -83,20 +87,20 @@ class GpuMemoryEventHandler const Device* pDevice); void VulkanAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, uint64_t objectHandle, VkObjectType objectType, uint64_t heapIndex); void VulkanAllocationFailedEvent( + const Device* pDevice, Pal::gpusize allocatedSize, VkObjectType objectType, uint64_t heapIndex); - void VulkanFreeEvent( - const Pal::IGpuMemory* pGpuMemory); - void VulkanSubAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset, Pal::gpusize subAllocationSize, @@ -105,10 +109,12 @@ class GpuMemoryEventHandler uint64_t heapIndex); void VulkanSubFreeEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset); void ReportDeferredPalSubAlloc( + const Device* pDevice, Pal::gpusize gpuVirtAddr, Pal::gpusize offset, const uint64_t objectHandle, @@ -121,6 +127,77 @@ PAL_DISALLOW_COPY_AND_ASSIGN(GpuMemoryEventHandler);
+ struct AllocationData + { + Pal::Developer::GpuMemoryData allocationData; + uint64_t objectHandle; + VkObjectType objectType; + bool reportedToDeviceMemoryReport; + }; + + struct SubAllocationData + { + Pal::Developer::GpuMemoryData allocationData; + uint64_t objectHandle; + VkObjectType objectType; + bool reportedToDeviceMemoryReport; + uint64_t memoryObjectId; + Pal::gpusize subAllocationSize; + Pal::gpusize offset; + uint64_t heapIndex; + }; + + struct BindData + { + Pal::Developer::BindGpuMemoryData bindGpuMemoryData; + uint64_t objectHandle; + VkObjectType objectType; + bool reportedToDeviceAddressBindingReport; + }; + + class BindDataListNode + { + public: + static void Create( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData, + BindDataListNode** ppObject); + + void Destroy(); + + BindData* GetData() { return &m_data; } + Util::IntrusiveListNode<BindDataListNode>* GetNode() { return &m_node; } + + private: + BindDataListNode( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData); + + Instance* m_pInstance; + BindData m_data; + Util::IntrusiveListNode<BindDataListNode> m_node; + + PAL_DISALLOW_COPY_AND_ASSIGN(BindDataListNode); + }; + + struct Interval + { + Interval() + : m_offset(0), m_size(0) + { + } + + Interval(const Pal::gpusize offset, const Pal::gpusize size) + : m_offset(offset), m_size(size) + { + } + + Pal::gpusize m_offset; + Pal::gpusize m_size; + }; + + static_assert(std::is_standard_layout<Interval>::value); + void HandlePalDeveloperCallback( Pal::Developer::CallbackType type, void* pCbData); @@ -147,6 +224,49 @@ class GpuMemoryEventHandler void
SendDeviceMemoryReportEvent( const VkDeviceMemoryReportCallbackDataEXT& callbackData); + // The caller of this function must hold the m_bindHashMapMutex mutex + void DeviceAddressBindingReportUnbindEvent( + const Pal::IGpuMemory* pGpuMemory, + const Interval& interval); + + void DeviceAddressBindingReportBindEvent( + const AllocationData* pAllocationData); + + void DeviceAddressBindingReportUnbindEvent( + const AllocationData* pAllocationData); + + void DeviceAddressBindingReportBindEvent( + const SubAllocationData* pSubAllocData); + + void DeviceAddressBindingReportUnbindEvent( + const SubAllocationData* pSubAllocData); + + void DeviceAddressBindingReportBindEvent( + BindData* pNewBindData); + + void DeviceAddressBindingReportUnbindEvent( + BindData* pNewBindData); + + void DeviceAddressBindingReportCallback( + uint64_t objectHandle, + VkObjectType objectType, + VkDeviceAddressBindingTypeEXT bindingType, + VkDeviceAddress bindingAddress, + VkDeviceSize allocatedSize, + bool isInternal); + + void ReportBindEvent( + BindData* pBindData, + uint64_t objectHandle, + VkObjectType objectType); + + void ReportUnbindEvent( + BindData* pBindData); + + bool CheckIntervalsIntersect( + const Interval& intervalOne, + const Interval& intervalTwo) const; + // Generates an ID, unique within the instance, for a GPU memory object uint64_t GenerateMemoryObjectId() { return Util::AtomicIncrement64(&m_memoryObjectId); } @@ -155,14 +275,6 @@ DeviceMemoryReportCallbacks m_callbacks; Util::RWLock m_callbacksLock; - typedef struct - { - Pal::Developer::GpuMemoryData allocationData; - uint64_t objectHandle; - VkObjectType objectType; - bool reportedToDeviceMemoryReport; - } AllocationData; - typedef Util::HashMap GpuMemoryAllocationHashMap; @@ -170,37 +282,39 @@ GpuMemoryAllocationHashMap m_allocationHashMap; Util::RWLock m_allocationHashMapLock; - typedef struct + struct SubAllocationKey { Pal::gpusize gpuVirtAddr; Pal::gpusize offset; - } SubAllocationKey; - - typedef struct - { - Pal::Developer::GpuMemoryData allocationData; - uint64_t objectHandle; - VkObjectType objectType; - bool reportedToDeviceMemoryReport; - uint64_t memoryObjectId; - Pal::gpusize subAllocationSize; - Pal::gpusize offset; - uint64_t heapIndex; - } SubAllocationData; + };
typedef Util::HashMap GpuMemorySubAllocationHashMap; + typedef Util::IntrusiveList<BindDataListNode> BindDataList; + + typedef Util::HashMap GpuMemoryBindHashMap; + GpuMemorySubAllocationHashMap m_vulkanSubAllocationHashMap; Util::RWLock m_vulkanSubAllocationHashMapLock; GpuMemorySubAllocationHashMap m_palSubAllocationHashMap; Util::RWLock m_palSubAllocationHashMapLock; + GpuMemoryBindHashMap m_bindHashMap; + Util::Mutex m_bindHashMapMutex; + + typedef Util::HashSet DeviceHashSet; + + DeviceHashSet m_deviceHashSet; + Util::RWLock m_deviceHashSetLock; + volatile uint64_t m_memoryObjectId; // Seed for memoryObjectId generation - volatile uint32_t m_memoryEventEnables; // The number of device extensions requesting memory events + volatile uint32_t m_deviceCount; // The number of devices with extensions that require memory events }; } // namespace vk diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index c0936964..a2e7ed3c 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -68,7 +68,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should
always be set to 0 // Version of this file -#define VK_HEADER_VERSION 246 +#define VK_HEADER_VERSION 250 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -1002,6 +1002,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_OPTICAL_FLOW_SESSION_CREATE_PRIVATE_DATA_INFO_NV = 1000464010, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LEGACY_DITHERING_FEATURES_EXT = 1000465000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_PROTECTED_ACCESS_FEATURES_EXT = 1000466000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_POSITION_FETCH_FEATURES_KHR = 1000481000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_FEATURES_EXT = 1000482000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_PROPERTIES_EXT = 1000482001, VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT = 1000482002, @@ -1019,6 +1020,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_LIBRARY_GROUP_HANDLES_FEATURES_EXT = 1000498000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PER_VIEW_RENDER_AREAS_FEATURES_QCOM = 1000510000, VK_STRUCTURE_TYPE_MULTIVIEW_PER_VIEW_RENDER_AREAS_RENDER_PASS_BEGIN_INFO_QCOM = 1000510001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_FEATURES_EXT = 1000524000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -1871,6 +1873,7 @@ typedef enum VkDynamicState { VK_DYNAMIC_STATE_SHADING_RATE_IMAGE_ENABLE_NV = 1000455030, VK_DYNAMIC_STATE_REPRESENTATIVE_FRAGMENT_TEST_ENABLE_NV = 1000455031, VK_DYNAMIC_STATE_COVERAGE_REDUCTION_MODE_NV = 1000455032, + VK_DYNAMIC_STATE_ATTACHMENT_FEEDBACK_LOOP_ENABLE_EXT = 1000524000, VK_DYNAMIC_STATE_CULL_MODE_EXT = VK_DYNAMIC_STATE_CULL_MODE, VK_DYNAMIC_STATE_FRONT_FACE_EXT = VK_DYNAMIC_STATE_FRONT_FACE, VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT = VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY, @@ -9419,11 +9422,11 @@ typedef struct VkVideoDecodeH265SessionParametersCreateInfoKHR { } VkVideoDecodeH265SessionParametersCreateInfoKHR; typedef struct VkVideoDecodeH265PictureInfoKHR { - VkStructureType sType; - const void* pNext; - StdVideoDecodeH265PictureInfo* pStdPictureInfo; - uint32_t sliceSegmentCount; - const uint32_t* pSliceSegmentOffsets; + VkStructureType sType; + const void* pNext; + const StdVideoDecodeH265PictureInfo* pStdPictureInfo; + uint32_t sliceSegmentCount; + const uint32_t* pSliceSegmentOffsets; } VkVideoDecodeH265PictureInfoKHR; typedef struct VkVideoDecodeH265DpbSlotInfoKHR { @@ -10230,6 +10233,17 @@ VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageSparseMemoryRequirementsKHR( #endif +#define VK_KHR_ray_tracing_position_fetch 1 +#define VK_KHR_RAY_TRACING_POSITION_FETCH_SPEC_VERSION 1 +#define VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME "VK_KHR_ray_tracing_position_fetch" +typedef struct VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 rayTracingPositionFetch; +} VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR; + + + #define VK_EXT_debug_report 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT) #define VK_EXT_DEBUG_REPORT_SPEC_VERSION 10 @@ -12090,6 +12104,7 @@ typedef enum VkBuildAccelerationStructureFlagBitsKHR { #ifdef VK_ENABLE_BETA_EXTENSIONS 
VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DISPLACEMENT_MICROMAP_UPDATE_NV = 0x00000200, #endif + VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR = 0x00000800, VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_NV = VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR, VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_NV = VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR, VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_NV = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR, @@ -14695,7 +14710,7 @@ typedef struct VkPhysicalDevice4444FormatsFeaturesEXT { #define VK_EXT_device_fault 1 -#define VK_EXT_DEVICE_FAULT_SPEC_VERSION 1 +#define VK_EXT_DEVICE_FAULT_SPEC_VERSION 2 #define VK_EXT_DEVICE_FAULT_EXTENSION_NAME "VK_EXT_device_fault" typedef enum VkDeviceFaultAddressTypeEXT { @@ -14759,6 +14774,8 @@ typedef struct VkDeviceFaultVendorBinaryHeaderVersionOneEXT { uint32_t applicationNameOffset; uint32_t applicationVersion; uint32_t engineNameOffset; + uint32_t engineVersion; + uint32_t apiVersion; } VkDeviceFaultVendorBinaryHeaderVersionOneEXT; typedef VkResult (VKAPI_PTR *PFN_vkGetDeviceFaultInfoEXT)(VkDevice device, VkDeviceFaultCountsEXT* pFaultCounts, VkDeviceFaultInfoEXT* pFaultInfo); @@ -15540,7 +15557,7 @@ VKAPI_ATTR void VKAPI_CALL vkGetMicromapBuildSizesEXT( #define VK_HUAWEI_cluster_culling_shader 1 -#define VK_HUAWEI_CLUSTER_CULLING_SHADER_SPEC_VERSION 1 +#define VK_HUAWEI_CLUSTER_CULLING_SHADER_SPEC_VERSION 2 #define VK_HUAWEI_CLUSTER_CULLING_SHADER_EXTENSION_NAME "VK_HUAWEI_cluster_culling_shader" typedef struct VkPhysicalDeviceClusterCullingShaderFeaturesHUAWEI { VkStructureType sType; @@ -16667,6 +16684,24 @@ typedef struct VkMultiviewPerViewRenderAreasRenderPassBeginInfoQCOM { +#define VK_EXT_attachment_feedback_loop_dynamic_state 1 +#define VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_SPEC_VERSION 1 +#define VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_EXTENSION_NAME "VK_EXT_attachment_feedback_loop_dynamic_state" +typedef struct VkPhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 attachmentFeedbackLoopDynamicState; +} VkPhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT; + +typedef void (VKAPI_PTR *PFN_vkCmdSetAttachmentFeedbackLoopEnableEXT)(VkCommandBuffer commandBuffer, VkImageAspectFlags aspectMask); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdSetAttachmentFeedbackLoopEnableEXT( + VkCommandBuffer commandBuffer, + VkImageAspectFlags aspectMask); +#endif + + #define VK_KHR_acceleration_structure 1 #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 #define VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME "VK_KHR_acceleration_structure" diff --git a/icd/api/include/log.h b/icd/api/include/log.h index 417fc8d5..9610cdef 100644 --- a/icd/api/include/log.h +++ b/icd/api/include/log.h @@ -63,13 +63,12 @@ static void AmdvlkLog( return; } - va_list argList; - va_start(argList, pFormatStr); - va_end(argList); - #if PAL_ENABLE_PRINTS_ASSERTS + va_list argList; Util::DbgPrintf(Util::DbgPrintCatMsgFile, Util::DbgPrintStyleNoPrefixNoCrLf, "%s-", LogTag[tagId]); + va_start(argList, pFormatStr); Util::DbgVPrintf(Util::DbgPrintCatMsgFile, Util::DbgPrintStyleNoPrefixNoCrLf, pFormatStr, argList); + va_end(argList); Util::DbgPrintf(Util::DbgPrintCatMsgFile, Util::DbgPrintStyleNoPrefixNoCrLf, "\n"); #endif } diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index 6a4a1b37..962af3d2 100644 --- a/icd/api/include/pipeline_compiler.h +++ 
b/icd/api/include/pipeline_compiler.h @@ -140,6 +140,7 @@ class PipelineCompiler VkResult BuildShaderModule( const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -169,6 +170,7 @@ class PipelineCompiler VkResult CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, ShaderModuleHandle* pModule); @@ -209,11 +211,6 @@ class PipelineCompiler PipelineMetadata* pBinaryMetadata, GraphicsPipelineBinaryCreateInfo* pCreateInfo); - static void SetPartialGraphicsPipelineBinaryInfo( - const ShaderModuleHandle* pShaderModuleHandle, - const ShaderStage stage, - GraphicsPipelineBinaryCreateInfo* pCreateInfo); - VkResult ConvertComputePipelineInfo( const Device* pDevice, const VkComputePipelineCreateInfo* pIn, diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 0340e8eb..76de0400 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -404,6 +404,7 @@ class CmdBuffer void BindIndexBuffer( VkBuffer buffer, VkDeviceSize offset, + VkDeviceSize size, VkIndexType indexType); void BindVertexBuffers( @@ -1050,7 +1051,8 @@ class CmdBuffer void PalCmdBindIndexData( Buffer* pBuffer, Pal::gpusize offset, - Pal::IndexType indexType); + Pal::IndexType indexType, + Pal::gpusize bufferSize); void PalCmdUnbindIndexData(Pal::IndexType indexType); @@ -1727,8 +1729,11 @@ class CmdBuffer GpuRt::DispatchRaysConstants* pConstants); void BindRayQueryConstants( - const Pipeline* pPipeline, - Pal::PipelineBindPoint bindPoint); + const Pipeline* pPipeline, + Pal::PipelineBindPoint bindPoint, + uint32_t width, + uint32_t height, + uint32_t depth); #endif union CmdBufferFlags diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h index 58ea36cd..8592b435 100644 --- a/icd/api/include/vk_conv.h +++ b/icd/api/include/vk_conv.h @@ -539,6 +539,7 @@ inline Pal::TexFilter VkToPalTexFilter( break; default: VK_NOT_IMPLEMENTED; + break; } const Pal::XyFilter pointFilter = (anisotropicEnabled != VK_FALSE) ? Pal::XyFilterAnisotropicPoint : @@ -555,6 +556,7 @@ inline Pal::TexFilter VkToPalTexFilter( break; default: VK_NOT_IMPLEMENTED; + break; } switch (minFilter) @@ -567,6 +569,7 @@ inline Pal::TexFilter VkToPalTexFilter( break; default: VK_NOT_IMPLEMENTED; + break; } return palTexFilter; @@ -1226,17 +1229,15 @@ inline void VkToPalSubresRange( uint32_t* pPalSubresRangeIndex, const RuntimeSettings& settings) { - constexpr uint32_t WHOLE_SIZE_UINT32 = (uint32_t)VK_WHOLE_SIZE; + // The minimums below are used for VkImageSubresourceRange VK_WHOLE_SIZE handling. Pal::SubresRange palSubresRange; palSubresRange.startSubres.arraySlice = range.baseArrayLayer; palSubresRange.startSubres.mipLevel = range.baseMipLevel; palSubresRange.numPlanes = 1; - palSubresRange.numMips = (range.levelCount == WHOLE_SIZE_UINT32) ? - (mipLevels - range.baseMipLevel) : range.levelCount; - palSubresRange.numSlices = (range.layerCount == WHOLE_SIZE_UINT32) ? 
- (arraySize - range.baseArrayLayer) : range.layerCount; + palSubresRange.numMips = Util::Min(range.levelCount, (mipLevels - range.baseMipLevel)); + palSubresRange.numSlices = Util::Min(range.layerCount, (arraySize - range.baseArrayLayer)); VkImageAspectFlags aspectMask = range.aspectMask; Pal::ChNumFormat palFormat = VkToPalFormat(format, settings).format; @@ -1251,7 +1252,8 @@ inline void VkToPalSubresRange( { palSubresRange.startSubres.plane = VkToPalImagePlaneExtract(palFormat, &aspectMask); pPalSubresRanges[(*pPalSubresRangeIndex)++] = palSubresRange; - } while (aspectMask != 0); + } + while (aspectMask != 0); } // ===================================================================================================================== @@ -1570,69 +1572,6 @@ void VkToPalImageScaledCopyRegion( while (aspectMask != 0); } -// ===================================================================================================================== -// Converts a Vulkan image-blit structure to one or more PAL color-space-conversion-region structures. -inline Pal::ColorSpaceConversionRegion VkToPalImageColorSpaceConversionRegion( - const VkImageBlit& imageBlit, - Pal::SwizzledFormat srcFormat, - Pal::SwizzledFormat dstFormat) -{ - - Pal::ColorSpaceConversionRegion region = {}; - - // Color conversion blits can only happen between a YUV and an RGB image. - VK_ASSERT((Pal::Formats::IsYuv(srcFormat.format) && (Pal::Formats::IsYuv(dstFormat.format) == false)) || - ((Pal::Formats::IsYuv(srcFormat.format) == false) && (Pal::Formats::IsYuv(dstFormat.format)))); - - const VkImageSubresourceLayers& rgbSubresource = - Pal::Formats::IsYuv(srcFormat.format) ? imageBlit.dstSubresource : imageBlit.srcSubresource; - - const VkImageSubresourceLayers& yuvSubresource = - Pal::Formats::IsYuv(srcFormat.format) ? imageBlit.srcSubresource : imageBlit.dstSubresource; - - // Convert values to temporary 3D variables as the PAL interface currently only accepts 2D - Pal::Offset3d srcOffset = VkToPalOffset3d(imageBlit.srcOffsets[0]); - Pal::SignedExtent3d srcExtent = VkToPalSignedExtent3d(imageBlit.srcOffsets); - Pal::Offset3d dstOffset = VkToPalOffset3d(imageBlit.dstOffsets[0]); - Pal::SignedExtent3d dstExtent = VkToPalSignedExtent3d(imageBlit.dstOffsets); - - region.rgbSubres.plane = 0; - region.rgbSubres.mipLevel = rgbSubresource.mipLevel; - region.rgbSubres.arraySlice = rgbSubresource.baseArrayLayer; - - VK_ASSERT(yuvSubresource.mipLevel == 0); - - region.yuvStartSlice = yuvSubresource.baseArrayLayer; - - VK_ASSERT(imageBlit.srcSubresource.layerCount == imageBlit.dstSubresource.layerCount); - VK_ASSERT(srcExtent.depth == dstExtent.depth); - - region.sliceCount = Util::Max(srcExtent.depth, imageBlit.srcSubresource.layerCount); - - // Write the 2D coordinates and ignore the 3rd dimension for now - region.srcOffset.x = srcOffset.x; - region.srcOffset.y = srcOffset.y; - - VK_ASSERT(srcOffset.z == 0); - - region.srcExtent.width = srcExtent.width; - region.srcExtent.height = srcExtent.height; - - VK_ASSERT(srcExtent.depth == 1); - - region.dstOffset.x = dstOffset.x; - region.dstOffset.y = dstOffset.y; - - VK_ASSERT(dstOffset.z == 0); - - region.dstExtent.width = dstExtent.width; - region.dstExtent.height = dstExtent.height; - - VK_ASSERT(dstExtent.depth == 1); - - return region; -} - // ===================================================================================================================== // Converts a Vulkan image-resolve structure to one or more PAL image-resolve-region structures. 
template @@ -3945,12 +3884,14 @@ struct UberFetchShaderFormatInfo // ===================================================================================================================== class UberFetchShaderFormatInfoMap : - public Util::HashMap + public Util::HashMap, 1024> { public: explicit UberFetchShaderFormatInfoMap(uint32 numBuckets, PalAllocator* const pAllocator) : - Util::HashMap(numBuckets, pAllocator), + Util::HashMap, 1024>(numBuckets, pAllocator), m_bufferFormatMask(0) { } diff --git a/icd/api/include/vk_deferred_operation.h b/icd/api/include/vk_deferred_operation.h index 3a7bd3f3..e4d37dee 100644 --- a/icd/api/include/vk_deferred_operation.h +++ b/icd/api/include/vk_deferred_operation.h @@ -57,7 +57,7 @@ typedef int32_t (*DeferredHostCallback)(Device* pDevice, struct DeferredWorkload { uint32_t nextInstance; // Next workload instance to execute - uint32_t completedInstances; // # of workload instances fully executed + uint32_t completedInstances; uint32_t totalInstances; // Actual # of workload instances (UINT_MAX if not yet known, 0 if no-op) uint32_t maxInstances; // Upper limit estimate of the # of instances (for when actual # is unavailable) void* pPayloads; // Array of payloads (per workload instance) diff --git a/icd/api/include/vk_defines.h b/icd/api/include/vk_defines.h index 2ee7d2d8..00a1182e 100644 --- a/icd/api/include/vk_defines.h +++ b/icd/api/include/vk_defines.h @@ -205,6 +205,8 @@ namespace vk PipelineCompilerTypeLlpc, // Use shader compiler provided by LLPC }; + // Point size must be set via gl_PointSize, otherwise it must be 1.0f + static const float DefaultPointSize = 1.0f; }// namespace vk #endif diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index ad2b596e..5659c842 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -156,7 +156,12 @@ class Device uint32 dynamicPrimitiveTopologyUnrestricted : 1; uint32 graphicsPipelineLibrary : 1; uint32 deviceMemoryReport : 1; - uint32 reserved : 18; + uint32 initializePointSizeInBegin : 1; + uint32 deviceAddressBindingReport : 1; + // True if EXT_DEVICE_MEMORY_REPORT or EXT_DEVICE_ADDRESS_BINDING_REPORT is enabled. 
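+            // (Annotation, not driver code: both extensions funnel into the shared
+            //  GpuMemoryEventHandler, so device creation would set this bit along the
+            //  lines of
+            //      gpuMemoryEventHandler = deviceMemoryReport | deviceAddressBindingReport;
+            //  which lets paths like InternalMemMgr test a single flag instead of two.)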
+ uint32 gpuMemoryEventHandler : 1; + uint32 assumeDynamicTopologyInLibs : 1; + uint32 reserved : 14; }; uint32 u32All; @@ -784,7 +789,7 @@ class Device const uint8_t* pCode, uint32_t numUserDataNodes, Vkgc::ResourceMappingRootNode* pUserDataNodes, - VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, bool forceWave64, const VkSpecializationInfo* pSpecializationInfo, InternalPipeline* pInternalPipeline); diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index 4690bb44..8f77bc58 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -359,6 +359,7 @@ class DeviceExtensions final : public Extensions EXT_DEPTH_RANGE_UNRESTRICTED, EXT_DESCRIPTOR_BUFFER, EXT_DESCRIPTOR_INDEXING, + EXT_DEVICE_ADDRESS_BINDING_REPORT, EXT_DEVICE_FAULT, EXT_DEVICE_MEMORY_REPORT, EXT_EXTENDED_DYNAMIC_STATE, @@ -414,6 +415,7 @@ class DeviceExtensions final : public Extensions EXT_SHADER_VIEWPORT_INDEX_LAYER, EXT_SUBGROUP_SIZE_CONTROL, EXT_TEXEL_BUFFER_ALIGNMENT, + EXT_TEXTURE_COMPRESSION_ASTC_HDR, EXT_TOOLING_INFO, EXT_TRANSFORM_FEEDBACK, EXT_VERTEX_ATTRIBUTE_DIVISOR, diff --git a/icd/api/include/vk_graphics_pipeline_library.h b/icd/api/include/vk_graphics_pipeline_library.h index 4ff4428b..d0b715a8 100644 --- a/icd/api/include/vk_graphics_pipeline_library.h +++ b/icd/api/include/vk_graphics_pipeline_library.h @@ -87,6 +87,7 @@ class GraphicsPipelineLibrary final : public GraphicsPipelineCommon, public NonD static VkResult CreatePartialPipelineBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, const GraphicsPipelineLibraryInfo* pLibInfo, const GraphicsPipelineShaderStageInfo* pShaderStageInfo, diff --git a/icd/api/include/vk_physical_device.h b/icd/api/include/vk_physical_device.h index 928faedf..855a0e52 100644 --- a/icd/api/include/vk_physical_device.h +++ b/icd/api/include/vk_physical_device.h @@ -222,12 +222,6 @@ class PhysicalDevice return index; } - bool GetQueueGroupCompatible( - uint32_t queueFamilyIndex) const - { - return m_queueFamilies[queueFamilyIndex].flags.queueGroupCompatible; - } - Pal::EngineType GetQueueFamilyPalEngineType( uint32_t queueFamilyIndex) const { @@ -862,17 +856,6 @@ class PhysicalDevice VkShaderStageFlags validShaderStages; uint32_t palImageLayoutFlag; VkQueueFamilyProperties properties; - - union - { - struct - { - uint32_t queueGroupCompatible : 1; - uint32_t reserved : 31; - }; - uint32_t u32All; - } flags; - } m_queueFamilies[Queue::MaxQueueFamilies]; // List of indices for compute engines that aren't exclusive. diff --git a/icd/api/include/vk_pipeline.h b/icd/api/include/vk_pipeline.h index 230cd427..575da33b 100644 --- a/icd/api/include/vk_pipeline.h +++ b/icd/api/include/vk_pipeline.h @@ -70,12 +70,13 @@ struct ShaderModuleHandle; // enabled. 
struct PipelineBinaryInfo { - static PipelineBinaryInfo* Create(size_t size, const void* pBinary, const VkAllocationCallbacks* pAllocator); + static PipelineBinaryInfo* Create(Util::MetroHash::Hash hash, size_t size, const void* pBinary, const VkAllocationCallbacks* pAllocator); void Destroy(const VkAllocationCallbacks* pAllocator); size_t binaryByteSize; void* pBinary; + Util::MetroHash::Hash binaryHash; }; enum class DynamicStatesInternal : uint32_t diff --git a/icd/api/include/vk_pipeline_cache.h b/icd/api/include/vk_pipeline_cache.h index fc403078..2791dbd5 100644 --- a/icd/api/include/vk_pipeline_cache.h +++ b/icd/api/include/vk_pipeline_cache.h @@ -36,13 +36,6 @@ namespace vk class Device; -// Layout for pipeline cache private header, all fields are written with LSB first. -struct PipelineCachePrivateHeaderData -{ - PipelineCompilerType cacheType; // Cache data type - uint64_t blobSize[MaxPalDevices]; // Blob data size for each device -}; - // ===================================================================================================================== // Implementation of Vulkan pipeline cache object class PipelineCache final : public NonDispatchable @@ -60,12 +53,6 @@ class PipelineCache final : public NonDispatchablem_offset); } - if ((result == VK_SUCCESS) && - (m_pDevice->GetEnabledFeatures().deviceMemoryReport == true)) + if (result == VK_SUCCESS) { - // Sub-allocation succeeded, either from an existing pool, or a new pool. Report the allocation to - // device_memory_report. - Pal::IGpuMemory* pPalGpuMem = pInternalMemory->PalMemory(DefaultDeviceIndex); - auto*const pPhysicalDevice = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex); - - uint32_t heapIndex = 0; - bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pPalGpuMem->Desc().heaps[0], &heapIndex); - VK_ASSERT(validHeap); - - m_pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanSubAllocateEvent( - pPalGpuMem, - pInternalMemory->m_offset, - pInternalMemory->m_size, - requestingObjectHandle, - requestingObjectType, - heapIndex); + const Device::DeviceFeatures& deviceFeatures = m_pDevice->GetEnabledFeatures(); + if (deviceFeatures.gpuMemoryEventHandler) + { + // Sub-allocation succeeded, either from an existing pool, or a new pool. Report the allocation to + // GpuMemoryEventHandler. 
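+                // (Annotation, not part of this change: the report wants a Vulkan heap index,
+                //  so the pool allocation's first PAL heap preference is translated through
+                //  GetVkHeapIndexFromPalHeap() before VulkanSubAllocateEvent() is raised.)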
+ Pal::IGpuMemory* pPalGpuMem = pInternalMemory->PalMemory(DefaultDeviceIndex); + auto* const pPhysicalDevice = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex); + + uint32_t heapIndex = 0; + bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pPalGpuMem->Desc().heaps[0], &heapIndex); + VK_ASSERT(validHeap); + + m_pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanSubAllocateEvent( + m_pDevice, + pPalGpuMem, + pInternalMemory->m_offset, + pInternalMemory->m_size, + requestingObjectHandle, + requestingObjectType, + heapIndex); + } } } } @@ -798,17 +802,20 @@ void InternalMemMgr::FreeGpuMem( if (pInternalMemory->m_memoryPool.pBuddyAllocator != nullptr) { + const Device::DeviceFeatures& deviceFeatures = m_pDevice->GetEnabledFeatures(); + // The memory was suballocated so free it using the buddy allocator pInternalMemory->m_memoryPool.pBuddyAllocator->Free( pInternalMemory->m_offset, pInternalMemory->m_size, pInternalMemory->m_alignment); - if (m_pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { Pal::IGpuMemory* pPalGpuMem = pInternalMemory->PalMemory(DefaultDeviceIndex); m_pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanSubFreeEvent( + m_pDevice, pPalGpuMem, pInternalMemory->m_offset); } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 05c0cab1..15fb39fd 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -73,19 +73,21 @@ static bool IsDynamicStateEnabled(const uint64_t dynamicStateFlags, const Dynami // ===================================================================================================================== // Populates shaderLibrary input flags according to settings static uint32_t GpuRtShaderLibraryFlags( - const PhysicalDevice* pDevice) + const Device* pDevice) { - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + GpuRt::TraceRayCounterMode counterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); uint32_t flags = 0; - if ((settings.rtTraceRayCounterMode != TraceRayCounterDisable) || + if ((counterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable) || (settings.rtTraceRayProfileFlags != TraceRayProfileDisable)) { flags |= static_cast(GpuRt::ShaderLibraryFeatureFlag::Developer); } - if (settings.emulatedRtIpLevel > HardwareRtIpLevel1_1) + if ((settings.emulatedRtIpLevel > HardwareRtIpLevel1_1) + ) { flags |= static_cast(GpuRt::ShaderLibraryFeatureFlag::SoftwareTraversal); } @@ -167,9 +169,8 @@ void PipelineCompiler::GetElfCacheMetricString( size_t outStrSize) { const int64_t freq = Util::GetPerfFrequency(); - - const int64_t avgUs = pCacheMatrix->totalBinaries > 0 ? - ((pCacheMatrix->totalTimeSpent / pCacheMatrix->totalBinaries) * 1000000) / freq : + const int64_t avgUs = (pCacheMatrix->totalBinaries + pCacheMatrix->cacheHits) > 0 ? 
+ ((pCacheMatrix->totalTimeSpent / (pCacheMatrix->totalBinaries + pCacheMatrix->cacheHits)) * 1000000) / freq : 0; const double avgMs = avgUs / 1000.0; @@ -182,13 +183,13 @@ void PipelineCompiler::GetElfCacheMetricString( static constexpr char metricFmtString[] = "%s\n" - "Cache hit rate - %0.1f%%\n" - "Total request count - %d\n" + "Cache hit rate - %0.1f%% (%d/%d)\n" + "Total new binary - %d\n" "Total time spent - %0.1f ms\n" "Average time spent per request - %0.3f ms\n\n"; - Util::Snprintf(pOutStr, outStrSize, metricFmtString, - pPrefixStr, hitRate * 100, pCacheMatrix->totalBinaries, totalMs, avgMs); + Util::Snprintf(pOutStr, outStrSize, metricFmtString, pPrefixStr, hitRate * 100, pCacheMatrix->cacheHits, + pCacheMatrix->cacheAttempts, pCacheMatrix->totalBinaries, totalMs, avgMs); } // ===================================================================================================================== @@ -315,15 +316,11 @@ VkResult PipelineCompiler::Initialize() // This isn't a terminal failure, the device can continue without the pipeline cache if need be. VK_ALERT(m_pBinaryCache == nullptr); - if (m_pBinaryCache != nullptr) - { - pCacheAdapter = m_pBinaryCache->GetCacheAdapter(); - } } if (result == VK_SUCCESS) { - result = m_compilerSolutionLlpc.Initialize(m_gfxIp, info.gfxLevel, pCacheAdapter); + result = m_compilerSolutionLlpc.Initialize(m_gfxIp, info.gfxLevel, m_pBinaryCache); } if (result == VK_SUCCESS) @@ -369,7 +366,7 @@ void PipelineCompiler::Destroy() auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); Util::MutexAuto mutexLock(&m_cacheLock); - if (m_pPhysicalDevice->GetRuntimeSettings().enableEarlyCompile) + if (SupportInternalModuleCache(m_pPhysicalDevice, GetCompilerCollectionMask())) { for (auto it = m_shaderModuleHandleMap.Begin(); it.Get() != nullptr; it.Next()) { @@ -397,38 +394,6 @@ void PipelineCompiler::Destroy() m_uberFetchShaderInternalDataMap.Reset(); } -// ===================================================================================================================== -// Creates shader cache object. -VkResult PipelineCompiler::CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - uint32_t expectedEntries, - void* pShaderCacheMem, - ShaderCache* pShaderCache) -{ - VkResult result = VK_SUCCESS; - - return result; -} - -// ===================================================================================================================== -// Gets the size of shader cache object. -size_t PipelineCompiler::GetShaderCacheSize( - PipelineCompilerType cacheType) -{ - size_t shaderCacheSize = 0; - return shaderCacheSize; -} - -// ===================================================================================================================== -// Gets shader cache type. -PipelineCompilerType PipelineCompiler::GetShaderCacheType() -{ - PipelineCompilerType cacheType; - cacheType = PipelineCompilerTypeLlpc; - return cacheType; -} - // ===================================================================================================================== // Loads shader binary from replace shader folder with specified shader hash code. 
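// (Usage sketch, not new behavior: with shader replacement enabled, a developer dumps
//  pipelines, edits the SPIR-V for a given 64-bit shader hash, and drops the binary into
//  the configured replace folder; ReplacePipelineShaderModule() below then feeds the
//  loaded bytes back through BuildShaderModule() in place of the application's code.)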
bool PipelineCompiler::LoadReplaceShaderBinary( @@ -589,6 +554,7 @@ void PipelineCompiler::StoreShaderModuleToCache( if ((pBinaryCache != nullptr) || supportInternalModuleCache) { + const Util::MetroHash::Hash shaderModuleCacheHash = GetShaderModuleCacheHash(flags, compilerMask, uniqueHash); @@ -628,6 +594,7 @@ void PipelineCompiler::StoreShaderModuleToCache( VkResult PipelineCompiler::BuildShaderModule( const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -670,15 +637,27 @@ VkResult PipelineCompiler::BuildShaderModule( result = LoadShaderModuleFromCache( pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pFeedback, pShaderModule); + VkShaderModuleCreateFlags internalFlags = internalShaderFlags; + if (result != VK_SUCCESS) { if (compilerMask & (1 << PipelineCompilerTypeLlpc)) { result = m_compilerSolutionLlpc.BuildShaderModule( - pDevice, flags, codeSize, pCode, adaptForFastLink, isInternal, pShaderModule, PipelineOptimizerKey{}); + pDevice, + flags, + internalFlags, + codeSize, + pCode, + adaptForFastLink, + isInternal, + pShaderModule, + PipelineOptimizerKey{}); } - StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pShaderModule); + { + StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pShaderModule); + } } else { @@ -744,12 +723,21 @@ void PipelineCompiler::FreeShaderModule( { m_compilerSolutionLlpc.FreeShaderModule(pShaderModule); auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + if (pShaderModule->elfPackage.codeSize > 0) + { + pInstance->FreeMem(const_cast(pShaderModule->elfPackage.pCode)); + } pInstance->FreeMem(pShaderModule->pRefCount); } } else { m_compilerSolutionLlpc.FreeShaderModule(pShaderModule); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + if (pShaderModule->elfPackage.codeSize > 0) + { + pInstance->FreeMem(const_cast(pShaderModule->elfPackage.pCode)); + } } } @@ -815,7 +803,7 @@ bool PipelineCompiler::ReplacePipelineShaderModule( if (LoadReplaceShaderBinary(hash64, &codeSize, &pCode)) { VkResult result = - BuildShaderModule(pDevice, 0, codeSize, pCode, false, false, nullptr, nullptr, pShaderModule); + BuildShaderModule(pDevice, 0, 0, codeSize, pCode, false, false, nullptr, nullptr, pShaderModule); if (result == VK_SUCCESS) { @@ -1013,20 +1001,18 @@ Util::Result PipelineCompiler::GetCachedPipelineBinary( *pIsInternalCacheHit = true; } } + m_pipelineCacheMatrix.totalTimeSpent += Util::GetPerfCpuTime() - startTime; if (*pIsUserCacheHit || *pIsInternalCacheHit) { *pFreeCompilerBinary = FreeWithInstanceAllocator; cacheResult = Util::Result::Success; m_pipelineCacheMatrix.cacheHits++; + DumpCacheMatrix(m_pPhysicalDevice, + "Pipeline_runtime", + m_pipelineCacheMatrix.totalBinaries + m_pipelineCacheMatrix.cacheHits, + &m_pipelineCacheMatrix); } - m_pipelineCacheMatrix.totalTimeSpent += Util::GetPerfCpuTime() - startTime; - - DumpCacheMatrix(m_pPhysicalDevice, - "Pipeline_runtime", - m_pipelineCacheMatrix.totalBinaries + m_pipelineCacheMatrix.cacheHits, - &m_pipelineCacheMatrix); - return cacheResult; } @@ -1145,11 +1131,11 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; Vkgc::PipelineBuildInfo pipelineInfo = {}; - pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; - uint64_t dumpHash = pipelineHash; - pPipelineDumpHandle = 
Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, - pipelineInfo, - dumpHash); + pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : pipelineHash; + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, + pipelineInfo, + dumpHash); } if (shouldCompile && (result == VK_SUCCESS)) @@ -1222,18 +1208,22 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( // Create ISA/relocatable shader for a specific shader based on pipeline information VkResult PipelineCompiler::CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, ShaderModuleHandle* pModule) { VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); const uint32_t compilerMask = GetCompilerCollectionMask(); - void* pPipelineDumpHandle = nullptr; + uint64_t libraryHash = Vkgc::IPipelineDumper::GetGraphicsShaderBinaryHash(&pCreateInfo->pipelineInfo, stage); + pCreateInfo->libraryHash[stage] = libraryHash; + void* pPipelineDumpHandle = nullptr; if (settings.enablePipelineDump) { - uint64_t dumpHash = Vkgc::IPipelineDumper::GetGraphicsShaderBinaryHash(&pCreateInfo->pipelineInfo, stage); + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : libraryHash; Vkgc::PipelineDumpOptions dumpOptions = {}; dumpOptions.pDumpDir = settings.pipelineDumpDir; @@ -1252,6 +1242,7 @@ VkResult PipelineCompiler::CreateGraphicsShaderBinary( { result = m_compilerSolutionLlpc.CreateGraphicsShaderBinary( pDevice, + pPipelineCache, stage, pCreateInfo, pPipelineDumpHandle, @@ -1323,8 +1314,16 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( } } } + if (shouldCompile) + { + if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && + (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) + { + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } - if (settings.enablePipelineDump) + if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; dumpOptions.pDumpDir = settings.pipelineDumpDir; @@ -1334,37 +1333,30 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( Vkgc::PipelineBuildInfo pipelineInfo = {}; pipelineInfo.pComputeInfo = &pCreateInfo->pipelineInfo; - pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, pipelineHash); + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? 
pCreateInfo->apiPsoHash : pipelineHash; + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); } - if (shouldCompile) + if (shouldCompile && (result == VK_SUCCESS)) { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + result = m_compilerSolutionLlpc.CreateComputePipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + pCreateInfo, + pPipelineBinarySize, + ppPipelineBinary, + pPipelineDumpHandle, + pipelineHash, + pCacheId, + &compileTime); } - else - { - if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) - { - result = m_compilerSolutionLlpc.CreateComputePipelineBinary( - pDevice, - deviceIdx, - pPipelineCache, - pCreateInfo, - pPipelineBinarySize, - ppPipelineBinary, - pPipelineDumpHandle, - pipelineHash, - pCacheId, - &compileTime); - } - if (result == VK_SUCCESS) - { - pCreateInfo->freeCompilerBinary = FreeWithCompiler; - } + if (result == VK_SUCCESS) + { + pCreateInfo->freeCompilerBinary = FreeWithCompiler; } } @@ -1610,7 +1602,10 @@ static void CopyPipelineShadersInfo( if ((shaderMask & (1 << stage)) != 0) { *pShaderInfosDst[stage] = *pShaderInfosSrc[stage]; + pCreateInfo->earlyElfPackage[stage] = libInfo.earlyElfPackage[stage]; + pCreateInfo->earlyElfPackageHash[stage] = libInfo.earlyElfPackageHash[stage]; + pCreateInfo->libraryHash[stage] = libInfo.libraryHash[stage]; } } } @@ -2037,6 +2032,7 @@ static VkResult BuildPipelineResourceMapping( if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); size_t genericMappingBufferSize = pLayout->GetPipelineInfo()->mappingBufferSize; @@ -2217,7 +2213,7 @@ static void BuildColorBlendState( VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - if (pCb != nullptr) + if ((pCb != nullptr) && (i < pCb->attachmentCount)) { const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorWriteMask) == false) @@ -2289,7 +2285,14 @@ static void BuildColorBlendState( } } - if (pCb != nullptr) + if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEquation) || + IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEnable)) + { + { + dualSourceBlendEnabled = true; + } + } + else if (pCb != nullptr) { dualSourceBlendEnabled = GraphicsPipelineCommon::GetDualSourceBlendEnableState(pDevice, pCb); } @@ -2361,8 +2364,9 @@ static void BuildPreRasterizationShaderState( const RenderPass* pRenderPass = RenderPass::ObjectFromHandle(pIn->renderPass); bool isConservativeOverestimation = false; bool unrestrictedPrimitiveTopology = - IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::PrimitiveTopology) && - pDevice->GetEnabledFeatures().dynamicPrimitiveTopologyUnrestricted; + pDevice->GetEnabledFeatures().assumeDynamicTopologyInLibs || + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::PrimitiveTopology) && + pDevice->GetEnabledFeatures().dynamicPrimitiveTopologyUnrestricted); BuildRasterizationState(pIn->pRasterizationState, dynamicStateFlags, &isConservativeOverestimation, pCreateInfo); @@ -2548,7 +2552,7 @@ static void BuildExecutablePipelineState( { pDefaultCompiler->SetRayTracingState(pDevice, &(pCreateInfo->pipelineInfo.rtState), 0); - 
uint32_t flags = GpuRtShaderLibraryFlags(pDevice->VkPhysicalDevice(DefaultDeviceIndex)); + uint32_t flags = GpuRtShaderLibraryFlags(pDevice); const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(flags); @@ -2692,13 +2696,16 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( pCreateInfo->pipelineInfo.enableUberFetchShader = false; } + if (libInfo.flags.isLibrary) + { + pCreateInfo->pipelineInfo.unlinked = true; + } if (libInfo.flags.isLibrary) { auto pPipelineBuildInfo = &pCreateInfo->pipelineInfo; pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); pPipelineBuildInfo->pInstance = pInstance; - pPipelineBuildInfo->unlinked = true; CompilerSolution::DisableNggCulling(&pPipelineBuildInfo->nggState); } @@ -2715,16 +2722,6 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( return result; } -// ===================================================================================================================== -// Fill partial pipeline binary info in GraphicsPipelineBinaryCreateInfo -void PipelineCompiler::SetPartialGraphicsPipelineBinaryInfo( - const ShaderModuleHandle* pShaderModuleHandle, - const ShaderStage stage, - GraphicsPipelineBinaryCreateInfo* pCreateInfo) -{ - pCreateInfo->earlyElfPackage[stage] = pShaderModuleHandle->elfPackage; -} - // ===================================================================================================================== // Checks which compiler is used template @@ -2814,6 +2811,8 @@ void PipelineCompiler::ApplyPipelineOptions( pOptions->threadGroupSwizzleMode = static_cast(settings.forceCsThreadGroupSwizzleMode); + pOptions->enableImplicitInvariantExports = (settings.disableImplicitInvariantExports == false); + pOptions->reverseThreadGroup = settings.enableAlternatingThreadGroupOrder; if (pDevice->GetEnabledFeatures().robustBufferAccessExtended) @@ -2888,6 +2887,8 @@ VkResult PipelineCompiler::ConvertComputePipelineInfo( } else { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); + pCreateInfo->pMappingBuffer = Util::VoidPtrInc(pCreateInfo->pTempBuffer, genericMappingBufferSize); // NOTE: Zero the allocated space that is used to create pipeline resource mappings. 
Some @@ -2932,7 +2933,7 @@ VkResult PipelineCompiler::ConvertComputePipelineInfo( if (pDevice->RayTrace() != nullptr) { - uint32_t flags = GpuRtShaderLibraryFlags(pDevice->VkPhysicalDevice(DefaultDeviceIndex)); + uint32_t flags = GpuRtShaderLibraryFlags(pDevice); const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(flags); VK_ASSERT(codePatch.dxilSize > 0); @@ -3251,6 +3252,8 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); + pCreateInfo->pMappingBuffer = pCreateInfo->pTempBuffer; tempBufferOffset += pCreateInfo->mappingBufferSize; @@ -3350,7 +3353,7 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( pCreateInfo->allowShaderInlining = false; } - uint32_t flags = GpuRtShaderLibraryFlags(pDevice->VkPhysicalDevice(DefaultDeviceIndex)); + uint32_t flags = GpuRtShaderLibraryFlags(pDevice); const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(flags); VK_ASSERT(codePatch.dxilSize > 0); @@ -3434,7 +3437,17 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( bool shaderModuleReplaced = false; - if (settings.enablePipelineDump) + if (shouldCompile) + { + if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && + (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) + { + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + + } + + if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; @@ -3446,8 +3459,9 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( Vkgc::PipelineBuildInfo pipelineInfo = {}; pipelineInfo.pRayTracingInfo = &pCreateInfo->pipelineInfo; + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? 
pCreateInfo->apiPsoHash : pipelineHash; pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump( - &dumpOptions, pipelineInfo, pipelineHash); + &dumpOptions, pipelineInfo, dumpHash); } uint32_t shaderCount = pCreateInfo->pipelineInfo.shaderCount; @@ -3498,44 +3512,36 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( } } - if (shouldCompile) + if (shouldCompile && (result == VK_SUCCESS)) { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - else + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { - if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) - { - int64_t startTime = Util::GetPerfCpuTime(); + int64_t startTime = Util::GetPerfCpuTime(); - // Build the LLPC pipeline - Llpc::RayTracingPipelineBuildOut pipelineOut = {}; + // Build the LLPC pipeline + Llpc::RayTracingPipelineBuildOut pipelineOut = {}; - // Fill pipeline create info for LLPC - pPipelineBuildInfo->pInstance = pInstance; - pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; + // Fill pipeline create info for LLPC + pPipelineBuildInfo->pInstance = pInstance; + pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; - result = m_compilerSolutionLlpc.CreateRayTracingPipelineBinary( - pDevice, - deviceIdx, - pPipelineCache, - pCreateInfo, - pPipelineBinary, - pPipelineDumpHandle, - pipelineHash, - pCacheId, - &compileTime); - - compileTime = Util::GetPerfCpuTime() - startTime; - } + result = m_compilerSolutionLlpc.CreateRayTracingPipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + pCreateInfo, + pPipelineBinary, + pPipelineDumpHandle, + pipelineHash, + pCacheId, + &compileTime); - if (result == VK_SUCCESS) - { - pCreateInfo->freeCompilerBinary = FreeWithCompiler; - } + compileTime = Util::GetPerfCpuTime() - startTime; + } + + if (result == VK_SUCCESS) + { + pCreateInfo->freeCompilerBinary = FreeWithCompiler; } } @@ -3705,7 +3711,8 @@ void PipelineCompiler::SetRayTracingState( &pRtState->staticPipelineFlags, &pRtState->triCompressMode, &pRtState->counterMode, - pRtState->pipelineFlags); + pRtState->pipelineFlags + ); // Set the indirect function calling convention and callee saved registers per shader type from settings pRtState->exportConfig.indirectCallingConvention = settings.indirectCallConvention; @@ -3760,6 +3767,10 @@ void PipelineCompiler::SetRayTracingState( pRtState->ldsSizePerThreadGroup = deviceProp.gfxipProperties.shaderCore.ldsSizePerThreadGroup; pRtState->maxRayLength = settings.rtMaxRayLength; + // Enables trace ray staticId and parentId handling (necessary for ray history dumps) + auto rtCounterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); + pRtState->enableRayTracingCounters = (rtCounterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable); + #if VKI_BUILD_GFX11 // Enable hardware traversal stack on RTIP 2.0+ if (settings.emulatedRtIpLevel > EmulatedRtIpLevel1_1) diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index e7e75890..06212a1f 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -27,6 +27,7 @@ #include "raytrace/ray_tracing_device.h" #include "raytrace/ray_tracing_util.h" #include "raytrace/vk_acceleration_structure.h" +#include "raytrace/vk_ray_tracing_pipeline.h" #include "include/vk_cmdbuffer.h" #include "include/vk_device.h" #include "include/vk_shader.h" @@ -98,6 
+99,24 @@ VkResult RayTracingDevice::Init() initInfo.pAccelStructTracker = GetAccelStructTracker(deviceIdx); initInfo.accelStructTrackerGpuAddr = GetAccelStructTrackerGpuVa(deviceIdx); + initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::None; + switch (m_pDevice->GetRuntimeSettings().emulatedRtIpLevel) + { + case EmulatedRtIpLevelNone: + break; + case HardwareRtIpLevel1_1: + case EmulatedRtIpLevel1_1: + initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::RtIp1_1; + break; +#if VKI_BUILD_GFX11 + case EmulatedRtIpLevel2_0: + initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::RtIp2_0; + break; +#endif + default: + break; + } + GpuRt::ClientCallbacks callbacks = {}; callbacks.pfnInsertRGPMarker = &RayTracingDevice::ClientInsertRGPMarker; callbacks.pfnConvertAccelStructBuildGeometry = @@ -179,6 +198,7 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->bvhCpuBuildModeFastBuild = static_cast(settings.rtBvhCpuBuildMode); pDeviceSettings->enableTriangleSplitting = settings.rtEnableTriangleSplitting; pDeviceSettings->triangleSplittingFactor = settings.rtTriangleSplittingFactor; + pDeviceSettings->enableFusedInstanceNode = settings.enableFusedInstanceNode; pDeviceSettings->rebraidFactor = settings.rebraidFactor; pDeviceSettings->rebraidLengthPercentage = settings.rebraidLengthPercentage; pDeviceSettings->maxTopDownBuildInstances = settings.maxTopDownBuildInstances; @@ -261,6 +281,16 @@ bool RayTracingDevice::AccelStructTrackerEnabled( m_pGpuRtDevice[deviceIdx]->AccelStructTraceEnabled()); } +// ===================================================================================================================== +GpuRt::TraceRayCounterMode RayTracingDevice::TraceRayCounterMode( + uint32_t deviceIdx) const +{ + // If the PAL trace path is enabled, then force RayHistoryLight + return m_pGpuRtDevice[deviceIdx]->RayHistoryTraceAvailable() ? + GpuRt::TraceRayCounterMode::TraceRayCounterRayHistoryLight : + static_cast(m_pDevice->GetRuntimeSettings().rtTraceRayCounterMode); +} + // ===================================================================================================================== GpuRt::AccelStructTracker* RayTracingDevice::GetAccelStructTracker( uint32_t deviceIdx) const @@ -513,6 +543,122 @@ uint64_t RayTracingDevice::GetAccelerationStructureUUID( return static_cast(gfxip) << 32 | vk::utils::GetBuildTimeHash(); } +// ===================================================================================================================== +void RayTracingDevice::SetDispatchInfo( + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::RtDispatchInfo* pDispatchInfo) const +{ + const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); + GpuRt::RtDispatchInfo dispatchInfo = {}; + + dispatchInfo.dimX = width; + dispatchInfo.dimY = height; + dispatchInfo.dimZ = depth; + + dispatchInfo.pipelineShaderCount = shaderCount; + dispatchInfo.stateObjectHash = apiHash; + + dispatchInfo.boxSortMode = settings.boxSortingHeuristic; +#if VKI_BUILD_GFX11 + dispatchInfo.usesNodePtrFlags = settings.rtEnableNodePointerFlags ? 
1 : 0; +#endif + + if (pipelineType == GpuRt::RtPipelineType::RayTracing) + { + dispatchInfo.raygenShaderTable.addr = static_cast(pRaygenSbt->deviceAddress); + dispatchInfo.raygenShaderTable.size = static_cast(pRaygenSbt->size); + dispatchInfo.raygenShaderTable.stride = static_cast(pRaygenSbt->stride); + + dispatchInfo.missShaderTable.addr = static_cast(pMissSbt->deviceAddress); + dispatchInfo.missShaderTable.size = static_cast(pMissSbt->size); + dispatchInfo.missShaderTable.stride = static_cast(pMissSbt->stride); + + dispatchInfo.hitGroupTable.addr = static_cast(pHitSbt->deviceAddress); + dispatchInfo.hitGroupTable.size = static_cast(pHitSbt->size); + dispatchInfo.hitGroupTable.stride = static_cast(pHitSbt->stride); + } + + (*pDispatchInfo) = dispatchInfo; +} + +// ===================================================================================================================== +void RayTracingDevice::TraceDispatch( + uint32_t deviceIdx, + Pal::ICmdBuffer* pPalCmdBuffer, + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::DispatchRaysConstants* pConstants) +{ + if (m_pGpuRtDevice[deviceIdx]->RayHistoryTraceActive()) + { + GpuRt::RtDispatchInfo dispatchInfo = {}; + SetDispatchInfo(pipelineType, + width, + height, + depth, + shaderCount, + apiHash, + pRaygenSbt, + pMissSbt, + pHitSbt, + &dispatchInfo); + + m_pGpuRtDevice[deviceIdx]->TraceRtDispatch(pPalCmdBuffer, + pipelineType, + dispatchInfo, + pConstants); + } +} + +// ===================================================================================================================== +void RayTracingDevice::TraceIndirectDispatch( + uint32_t deviceIdx, + GpuRt::RtPipelineType pipelineType, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + Pal::gpusize* pCounterMetadataVa, + GpuRt::InitExecuteIndirectConstants* pConstants) +{ + if (m_pGpuRtDevice[deviceIdx]->RayHistoryTraceActive()) + { + GpuRt::RtDispatchInfo dispatchInfo = {}; + SetDispatchInfo(pipelineType, + 0, + 0, + 0, + shaderCount, + apiHash, + pRaygenSbt, + pMissSbt, + pHitSbt, + &dispatchInfo); + + m_pGpuRtDevice[deviceIdx]->TraceIndirectRtDispatch(pipelineType, + dispatchInfo, + 1, + pCounterMetadataVa, + pConstants); + } +} + // ===================================================================================================================== // Compile one of gpurt's internal pipelines. 
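// =====================================================================================================================
// Editor's note: a standalone sketch (stand-in types, not driver code) of the gating used by the TraceDispatch and
// TraceIndirectDispatch wrappers above. GpuRT is only called while a ray history trace is active, and the indirect
// path reports zero dimensions because the real dispatch size is only known on the GPU timeline.
#include <cstdint>

struct RtDispatchInfoSketch
{
    uint32_t dimX;
    uint32_t dimY;
    uint32_t dimZ;
};

bool TraceDispatchIfActive(
    bool                  rayHistoryTraceActive, // models RayHistoryTraceActive() in the hunks above
    uint32_t              width,                 // 0 for indirect dispatches
    uint32_t              height,
    uint32_t              depth,
    RtDispatchInfoSketch* pDispatchInfo)
{
    if (rayHistoryTraceActive == false)
    {
        return false; // No trace: the dispatch runs without ray history instrumentation
    }

    pDispatchInfo->dimX = width;
    pDispatchInfo->dimY = height;
    pDispatchInfo->dimZ = depth;

    return true;
}
// =====================================================================================================================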
Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( @@ -657,7 +803,7 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( static_cast(buildInfo.code.pSpvCode), buildInfo.nodeCount, nodes, - VK_SHADER_MODULE_RAY_TRACING_INTERNAL_SHADER_BIT, + VK_INTERNAL_SHADER_FLAGS_RAY_TRACING_INTERNAL_SHADER_BIT, forceWave64, &specializationInfo, &pDevice->GetInternalRayTracingPipeline()); diff --git a/icd/api/raytrace/ray_tracing_device.h b/icd/api/raytrace/ray_tracing_device.h index 9a0c27b3..f13ce5b9 100644 --- a/icd/api/raytrace/ray_tracing_device.h +++ b/icd/api/raytrace/ray_tracing_device.h @@ -85,6 +85,33 @@ class RayTracingDevice uint32_t GetProfileRayFlags() const { return m_profileRayFlags; } uint32_t GetProfileMaxIterations() const { return m_profileMaxIterations; } + GpuRt::TraceRayCounterMode TraceRayCounterMode(uint32_t deviceIdx) const; + + void TraceDispatch( + uint32_t deviceIdx, + Pal::ICmdBuffer* pPalCmdBuffer, + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::DispatchRaysConstants* pConstants); + + void TraceIndirectDispatch( + uint32_t deviceIdx, + GpuRt::RtPipelineType pipelineType, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + Pal::gpusize* pCounterMetadataVa, + GpuRt::InitExecuteIndirectConstants* pConstants); + private: Device* m_pDevice; @@ -145,6 +172,18 @@ class RayTracingDevice const GpuRt::DeviceInitInfo& initInfo, ClientGpuMemHandle gpuMem); + void SetDispatchInfo( + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::RtDispatchInfo* pDispatchInfo) const; + AccelStructTrackerResources m_accelStructTrackerResources[MaxPalDevices]; }; diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp index b8f7e8d2..f1b9ce98 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp @@ -463,6 +463,7 @@ VkResult RayTracingPipeline::CreateImpl( BuildApiHash(pCreateInfo, &elfHash, &apiPsoHash); binaryCreateInfo.pDeferredWorkload = pDeferredWorkload; + binaryCreateInfo.apiPsoHash = apiPsoHash; const VkPipelineCreationFeedbackCreateInfoEXT* pPipelineCreationFeedbackCreateInfo = nullptr; pDefaultCompiler->GetPipelineCreationFeedback(static_cast(pCreateInfo->pNext), @@ -1335,7 +1336,8 @@ VkResult RayTracingPipeline::CreateImpl( if ((result == VK_SUCCESS) && m_pDevice->IsExtensionEnabled(DeviceExtensions::AMD_SHADER_INFO)) { - pBinary = PipelineBinaryInfo::Create(pipelineBinary[DefaultDeviceIndex].pPipelineBins[0].codeSize, + pBinary = PipelineBinaryInfo::Create(cacheId[DefaultDeviceIndex], + pipelineBinary[DefaultDeviceIndex].pPipelineBins[0].codeSize, pipelineBinary[DefaultDeviceIndex].pPipelineBins[0].pCode, pAllocator); } @@ -1948,22 +1950,24 @@ void RayTracingPipeline::ConvertStaticPipelineFlags( uint32_t* pStaticFlags, uint32_t* pTriangleCompressMode, uint32_t* pCounterMode, - uint32_t pipelineFlags) + uint32_t 
pipelineFlags +) { - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + GpuRt::TraceRayCounterMode counterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); uint32_t staticFlags = pDevice->RayTrace()->GpuRt(DefaultDeviceIndex)->GetStaticPipelineFlags( Util::TestAnyFlagSet(pipelineFlags, VK_PIPELINE_CREATE_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR), Util::TestAnyFlagSet(pipelineFlags, VK_PIPELINE_CREATE_RAY_TRACING_SKIP_AABBS_BIT_KHR), settings.rtUseRayQueryForTraceRays, pDevice->RayTrace()->AccelStructTrackerEnabled(DefaultDeviceIndex), - (settings.rtTraceRayCounterMode != TraceRayCounterMode::TraceRayCounterDisable)); + (counterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable)); *pStaticFlags = staticFlags; *pTriangleCompressMode = static_cast(ConvertGpuRtTriCompressMode(settings.rtTriangleCompressionMode)); - *pCounterMode = settings.rtTraceRayCounterMode; + *pCounterMode = static_cast(counterMode); } // ===================================================================================================================== @@ -2142,7 +2146,6 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetRayTracingShaderGroupHandlesKHR( { RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pipeline); - // #raytracing: MGPU support - Return based on DefaultDeviceIndex since the result shouldn't vary between GPUs. pPipeline->GetRayTracingShaderGroupHandles(DefaultDeviceIndex, firstGroup, groupCount, dataSize, pData); return VK_SUCCESS; diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.h b/icd/api/raytrace/vk_ray_tracing_pipeline.h index 6421297c..757e7830 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.h +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.h @@ -215,7 +215,8 @@ class RayTracingPipeline final : public Pipeline, public NonDispatchableGetSize() - offset); + uint32_t indexCount = 0; + if (bufferSize == VK_WHOLE_SIZE) + { + indexCount = utils::BufferSizeToIndexCount(indexType, pBuffer->GetSize() - offset); + } + else + { + indexCount = utils::BufferSizeToIndexCount(indexType, bufferSize - offset); + } utils::IterateMask deviceGroup(m_curDeviceMask); do @@ -1685,13 +1694,32 @@ VkResult CmdBuffer::Begin( Pal::GlobalScissorParams scissorParams = { }; scissorParams.scissorRegion.extent.width = limits.maxFramebufferWidth; scissorParams.scissorRegion.extent.height = limits.maxFramebufferHeight; - utils::IterateMask deviceGroup(GetDeviceMask()); - do { - const uint32_t deviceIdx = deviceGroup.Index(); - PalCmdBuffer(deviceIdx)->CmdSetGlobalScissor(scissorParams); + utils::IterateMask deviceGroup(GetDeviceMask()); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + PalCmdBuffer(deviceIdx)->CmdSetGlobalScissor(scissorParams); + } + while (deviceGroup.IterateNext()); + } + + if (m_pDevice->GetEnabledFeatures().initializePointSizeInBegin) + { + m_allGpuState.staticTokens.pointLineRasterState = DynamicRenderStateToken; + const Pal::PointLineRasterStateParams params = { DefaultPointSize, + 0.0f, // Default line width is zero + limits.pointSizeRange[0], + limits.pointSizeRange[1] }; + + utils::IterateMask deviceGroup(GetDeviceMask()); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + PalCmdBuffer(deviceIdx)->CmdSetPointLineRasterState(params); + } + while (deviceGroup.IterateNext()); } - while (deviceGroup.IterateNext()); const uint32_t supportedVrsRates = deviceProps.gfxipProperties.supportedVrsRates; @@ -2369,7 +2397,8 @@ void CmdBuffer::RebindUserData( 
userDataLayout.uberFetchConstBufRegBase, 2, reinterpret_cast(&gpuAddress)); - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); } } @@ -2998,6 +3027,7 @@ PFN_vkCmdPushDescriptorSetWithTemplateKHR CmdBuffer::GetCmdPushDescriptorSetWith void CmdBuffer::BindIndexBuffer( VkBuffer buffer, VkDeviceSize offset, + VkDeviceSize size, VkIndexType indexType) { DbgBarrierPreCmd(DbgBarrierBindIndexVertexBuffer); @@ -3007,7 +3037,7 @@ void CmdBuffer::BindIndexBuffer( if (pBuffer != NULL) { - PalCmdBindIndexData(pBuffer, offset, palIndexType); + PalCmdBindIndexData(pBuffer, offset, palIndexType, size); } else { @@ -3117,7 +3147,8 @@ void CmdBuffer::BindVertexBuffers( PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers( firstBinding, bindingCount, &PerGpuState(deviceIdx)->vbBindings[firstBinding]); - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); m_vbWatermark = Util::Max(m_vbWatermark, firstBinding + bindingCount); @@ -3194,7 +3225,7 @@ void CmdBuffer::Draw( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif { @@ -3221,7 +3252,7 @@ void CmdBuffer::DrawIndexed( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif { @@ -3251,7 +3282,7 @@ void CmdBuffer::DrawIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); @@ -3309,7 +3340,7 @@ void CmdBuffer::DrawMeshTasks( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif PalCmdDrawMeshTasks(x, y, z); @@ -3332,7 +3363,7 @@ void CmdBuffer::DrawMeshTasksIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif PalCmdDrawMeshTasksIndirect(buffer, offset, count, stride, countBuffer, countOffset); @@ -3354,7 +3385,7 @@ void CmdBuffer::Dispatch( } #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute); + BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, x, y, z); #endif if (m_pDevice->GetRuntimeSettings().enableAlternatingThreadGroupOrder) @@ -3384,7 +3415,7 @@ void CmdBuffer::DispatchOffset( } #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute); + BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, dim_x, dim_y, dim_z); #endif PalCmdDispatchOffset(base_x, base_y, base_z, dim_x, dim_y, dim_z); @@ -4924,59 +4955,61 @@ void CmdBuffer::LoadOpClearColor( { // Get the image view from the attachment info const ImageView* const pImageView = ImageView::ObjectFromHandle(attachmentInfo.imageView); + if (pImageView != VK_NULL_HANDLE) + { 
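+                // Editor's note: with dynamic rendering, VkRenderingAttachmentInfo::imageView may legally be
+                // VK_NULL_HANDLE, so the load-op clear dereferences the view only inside this guard; the depth and
+                // stencil attachment paths below add the same null-handle check.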
+ // Get the attachment image + const Image* pImage = pImageView->GetImage(); - // Get the attachment image - const Image* pImage = pImageView->GetImage(); - - // Convert the clear color to the format of the attachment view - Pal::SwizzledFormat clearFormat = VkToPalFormat( - pImageView->GetViewFormat(), - m_pDevice->GetRuntimeSettings()); - Pal::ClearColor clearColor = VkToPalClearColor( - attachmentInfo.clearValue.color, - clearFormat); + // Convert the clear color to the format of the attachment view + Pal::SwizzledFormat clearFormat = VkToPalFormat( + pImageView->GetViewFormat(), + m_pDevice->GetRuntimeSettings()); + Pal::ClearColor clearColor = VkToPalClearColor( + attachmentInfo.clearValue.color, + clearFormat); - // Get subres range from the image view - Pal::SubresRange subresRange = {}; - pImageView->GetFrameBufferAttachmentSubresRange(&subresRange); + // Get subres range from the image view + Pal::SubresRange subresRange = {}; + pImageView->GetFrameBufferAttachmentSubresRange(&subresRange); - // Override the number of slices with layerCount from pBeginRendering - subresRange.numSlices = pRenderingInfo->layerCount; + // Override the number of slices with layerCount from pBeginRendering + subresRange.numSlices = pRenderingInfo->layerCount; - const auto clearSubresRanges = LoadOpClearSubresRanges( - pRenderingInfo->viewMask, - subresRange); + const auto clearSubresRanges = LoadOpClearSubresRanges( + pRenderingInfo->viewMask, + subresRange); - // Clear Layout - const Pal::ImageLayout clearLayout = pImage->GetBarrierPolicy().GetAspectLayout( - attachmentInfo.imageLayout, - subresRange.startSubres.plane, - GetQueueFamilyIndex(), - pImage->GetFormat()); + // Clear Layout + const Pal::ImageLayout clearLayout = pImage->GetBarrierPolicy().GetAspectLayout( + attachmentInfo.imageLayout, + subresRange.startSubres.plane, + GetQueueFamilyIndex(), + pImage->GetFormat()); - utils::IterateMask deviceGroup(GetDeviceMask()); - - do - { - const uint32_t deviceIdx = deviceGroup.Index(); + utils::IterateMask deviceGroup(GetDeviceMask()); - // Clear Box - Pal::Box clearBox = BuildClearBox( - pDeviceGroupRenderArea[deviceIdx], - *pImageView); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); - PalCmdBuffer(deviceIdx)->CmdClearColorImage( - *pImage->PalImage(deviceIdx), - clearLayout, - clearColor, - clearFormat, - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - 1, - &clearBox, - Pal::ColorClearAutoSync); + // Clear Box + Pal::Box clearBox = BuildClearBox( + pDeviceGroupRenderArea[deviceIdx], + *pImageView); + + PalCmdBuffer(deviceIdx)->CmdClearColorImage( + *pImage->PalImage(deviceIdx), + clearLayout, + clearColor, + clearFormat, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + 1, + &clearBox, + Pal::ColorClearAutoSync); + } + while (deviceGroup.IterateNext()); } - while (deviceGroup.IterateNext()); } } } @@ -5007,19 +5040,22 @@ void CmdBuffer::LoadOpClearDepthStencil( { const ImageView* const pStencilImageView = ImageView::ObjectFromHandle(pStencilAttachmentInfo->imageView); - pDepthStencilImage = pStencilImageView->GetImage(); + if (pStencilImageView != VK_NULL_HANDLE) + { + pDepthStencilImage = pStencilImageView->GetImage(); - GetImageLayout( - pStencilAttachmentInfo->imageView, - pStencilAttachmentInfo->imageLayout, - VK_IMAGE_ASPECT_STENCIL_BIT, - &subresRange, - &stencilLayout); + GetImageLayout( + pStencilAttachmentInfo->imageView, + pStencilAttachmentInfo->imageLayout, + VK_IMAGE_ASPECT_STENCIL_BIT, + &subresRange, + &stencilLayout); - if 
(pStencilAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) - { - clearSubresRanges.PushBack(subresRange); - clearStencil = pStencilAttachmentInfo->clearValue.depthStencil.stencil; + if (pStencilAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) + { + clearSubresRanges.PushBack(subresRange); + clearStencil = pStencilAttachmentInfo->clearValue.depthStencil.stencil; + } } } @@ -5028,19 +5064,22 @@ void CmdBuffer::LoadOpClearDepthStencil( { const ImageView* const pDepthImageView = ImageView::ObjectFromHandle(pDepthAttachmentInfo->imageView); - pDepthStencilImage = pDepthImageView->GetImage(); + if (pDepthImageView != VK_NULL_HANDLE) + { + pDepthStencilImage = pDepthImageView->GetImage(); - GetImageLayout( - pDepthAttachmentInfo->imageView, - pDepthAttachmentInfo->imageLayout, - VK_IMAGE_ASPECT_DEPTH_BIT, - &subresRange, - &depthLayout); + GetImageLayout( + pDepthAttachmentInfo->imageView, + pDepthAttachmentInfo->imageLayout, + VK_IMAGE_ASPECT_DEPTH_BIT, + &subresRange, + &depthLayout); - if (pDepthAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) - { - clearSubresRanges.PushBack(subresRange); - clearDepth = pDepthAttachmentInfo->clearValue.depthStencil.depth; + if (pDepthAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) + { + clearSubresRanges.PushBack(subresRange); + clearDepth = pDepthAttachmentInfo->clearValue.depthStencil.depth; + } } } else @@ -7033,7 +7072,8 @@ void CmdBuffer::ResetAccelerationStructureQueryPool( accelerationStructureQueryPool.GetSlotOffset(firstQuery), accelerationStructureQueryPool.GetSlotSize() * queryCount, 0); - } while (deviceGroup1.IterateNext()); + } + while (deviceGroup1.IterateNext()); // Wait for memory fill to complete { @@ -7831,7 +7871,8 @@ void CmdBuffer::QueryCopy( PalCmdBuffer(deviceIdx)->CmdRestoreComputeState(Pal::ComputeStatePipelineAndUserData); // Note that the application is responsible for doing a post-copy sync using a barrier. 
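// =====================================================================================================================
// Editor's note: the hunks above and below reformat the recurring multi-GPU loop so that the trailing
// "while (deviceGroup.IterateNext());" sits on its own line; behavior is unchanged. A minimal stand-in for the
// utils::IterateMask helper behind that idiom (illustrative only, not the driver's implementation):
#include <cassert>
#include <cstdint>

class IterateMaskSketch
{
public:
    // Used as: IterateMaskSketch deviceGroup(mask);
    //          do { uint32_t deviceIdx = deviceGroup.Index(); /* per-device work */ }
    //          while (deviceGroup.IterateNext());
    explicit IterateMaskSketch(uint32_t mask)
        :
        m_mask(mask),
        m_index(0)
    {
        assert(mask != 0); // the do/while idiom assumes at least one device bit is set
        SkipToNextSetBit();
    }

    uint32_t Index() const { return m_index; }

    bool IterateNext()
    {
        m_mask &= ~(1u << m_index); // consume the current device bit
        SkipToNextSetBit();
        return m_mask != 0;
    }

private:
    void SkipToNextSetBit()
    {
        while ((m_mask != 0) && ((m_mask & (1u << m_index)) == 0))
        {
            ++m_index;
        }
    }

    uint32_t m_mask;
    uint32_t m_index;
};
// =====================================================================================================================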
- } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); } // ===================================================================================================================== @@ -7899,10 +7940,6 @@ void CmdBuffer::WriteTimestamp( void CmdBuffer::SetSampleLocations( const VkSampleLocationsInfoEXT* pSampleLocationsInfo) { - VK_ASSERT((m_allGpuState.pGraphicsPipeline != nullptr) && - (m_allGpuState.pGraphicsPipeline->ContainsStaticState( - DynamicStatesInternal::SampleLocations) == false)); - uint32_t sampleLocationsPerPixel = static_cast(pSampleLocationsInfo->sampleLocationsPerPixel); if (sampleLocationsPerPixel > 0) @@ -8433,26 +8470,21 @@ void CmdBuffer::RPSyncPointLegacy( const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; - const uint32_t sampleCount = attachment.pImage->GetImageSamples(); - - if (sampleCount > 0) + if (attachment.pImage->IsSampleLocationsCompatibleDepth() && + tr.flags.isInitialLayoutTransition) { - if (attachment.pImage->IsSampleLocationsCompatibleDepth() && - tr.flags.isInitialLayoutTransition) - { - VK_ASSERT(attachment.pImage->HasDepth()); + VK_ASSERT(attachment.pImage->HasDepth()); - // Use the provided sample locations for this attachment if this is its - // initial layout transition - pQuadSamplePattern = - &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; - } - else - { - // Otherwise, use the subpass' sample locations - uint32_t subpass = m_renderPassInstance.subpass; - pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; - } + // Use the provided sample locations for this attachment if this is its + // initial layout transition + pQuadSamplePattern = + &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; + } + else + { + // Otherwise, use the subpass' sample locations + uint32_t subpass = m_renderPassInstance.subpass; + pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; } pLayoutTransition->imageInfo.pQuadSamplePattern = pQuadSamplePattern; @@ -8620,26 +8652,21 @@ void CmdBuffer::RPSyncPoint( const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; - const uint32_t sampleCount = attachment.pImage->GetImageSamples(); + if (attachment.pImage->IsSampleLocationsCompatibleDepth() && + tr.flags.isInitialLayoutTransition) + { + VK_ASSERT(attachment.pImage->HasDepth()); - if (sampleCount > 0) + // Use the provided sample locations for this attachment if this is its + // initial layout transition + pQuadSamplePattern = + &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; + } + else { - if (attachment.pImage->IsSampleLocationsCompatibleDepth() && - tr.flags.isInitialLayoutTransition) - { - VK_ASSERT(attachment.pImage->HasDepth()); - - // Use the provided sample locations for this attachment if this is its - // initial layout transition - pQuadSamplePattern = - &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; - } - else - { - // Otherwise, use the subpass' sample locations - uint32_t subpass = m_renderPassInstance.subpass; - pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; - } + // Otherwise, use the subpass' sample locations + uint32_t subpass = m_renderPassInstance.subpass; + pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; } pPalTransitions[acquireReleaseInfo.imageBarrierCount].pQuadSamplePattern = pQuadSamplePattern; @@ -10110,10 +10137,9 @@ void CmdBuffer::SetLineWidth( { 
DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); - constexpr float PointWidth = 1.0f; // gl_PointSize is arbitrary, elsewhere pointSize is 1.0 const VkPhysicalDeviceLimits& limits = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetLimits(); - const Pal::PointLineRasterStateParams params = { PointWidth, + const Pal::PointLineRasterStateParams params = { DefaultPointSize, lineWidth, limits.pointSizeRange[0], limits.pointSizeRange[1] }; @@ -10310,13 +10336,17 @@ void CmdBuffer::SetVertexInput( pUberFetchShaderInternalData = Util::VoidPtrInc(pUberFetchShaderInternalData, uberFetchShaderInternalDataSize); // Update vertex buffer stride - uint32 firstChanged = UINT_MAX; - uint32 lastChanged = 0; + uint32_t firstChanged = UINT_MAX; + uint32_t lastChanged = 0; + uint32_t vertexBufferCount = 0; Pal::BufferViewInfo* pVbBindings = PerGpuState(deviceIdx)->vbBindings; - for (uint32 bindex = 0; bindex < vertexBindingDescriptionCount; ++bindex) + for (uint32_t bindex = 0; bindex < vertexBindingDescriptionCount; ++bindex) { - uint32 byteStride = pVertexBindingDescriptions[bindex].stride; - uint32 binding = pVertexBindingDescriptions[bindex].binding; + uint32_t byteStride = pVertexBindingDescriptions[bindex].stride; + uint32_t binding = pVertexBindingDescriptions[bindex].binding; + + vertexBufferCount = Util::Max(binding + 1, vertexBufferCount); + Pal::BufferViewInfo* pBinding = &pVbBindings[binding]; if (pBinding->stride != byteStride) @@ -10343,7 +10373,14 @@ void CmdBuffer::SetVertexInput( &PerGpuState(deviceIdx)->vbBindings[firstChanged]); } - } while (deviceGroup.IterateNext()); + if (vertexBufferCount != pBindState->dynamicBindInfo.gfx.dynamicState.vertexBufferCount) + { + pBindState->dynamicBindInfo.gfx.dynamicState.vertexBufferCount = vertexBufferCount; + m_allGpuState.dirtyGraphics.pipeline = 1; + } + + } + while (deviceGroup.IterateNext()); } } @@ -10623,7 +10660,7 @@ void CmdBuffer::DrawIndirectByteCount( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif utils::IterateMask deviceGroup(m_curDeviceMask); @@ -11142,7 +11179,22 @@ void CmdBuffer::GetRayTracingDispatchArgs( static_assert(uint32_t(GpuRt::TraceRayCounterDispatch) == uint32_t(TraceRayCounterDispatch), "Wrong enum value, TraceRayCounterDispatch != GpuRt::TraceRayCounterDispatch"); - pConstants->constData.counterMode = static_cast(settings.rtTraceRayCounterMode); + if (width > 0) + { + // Populate internalUavBufferSrd only for direct dispatches (where width, height, and depth are known) + m_pDevice->RayTrace()->TraceDispatch(deviceIdx, + PalCmdBuffer(deviceIdx), + GpuRt::RtPipelineType::RayTracing, + width, + height, + depth, + pPipeline->GetShaderGroupCount() + 1, + pPipeline->GetApiHash(), + &raygenSbt, + &missSbt, + &hitSbt, + pConstants); + } } @@ -11327,6 +11379,16 @@ void CmdBuffer::TraceRaysIndirectPerDevice( initUserData.outputBufferVa = pScratchMemory->GpuVirtAddr(deviceIdx); initUserData.outputConstantsVa = constants.descriptorTable.dispatchRaysConstGpuVa; + m_pDevice->RayTrace()->TraceIndirectDispatch(deviceIdx, + GpuRt::RtPipelineType::RayTracing, + pPipeline->GetShaderGroupCount() + 1, + pPipeline->GetApiHash(), + &raygenShaderBindingTable, + &missShaderBindingTable, + &hitShaderBindingTable, + &initUserData.outputCounterMetaVa, + pInitConstants); + 
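// =====================================================================================================================
// Editor's note: dispatch dimensions are now threaded through BindRayQueryConstants and GetRayTracingDispatchArgs so
// ray history can record them. The convention, sketched with a hypothetical type: direct compute dispatches pass
// their thread-group counts, while draws and indirect dispatches pass zeros because their dimensions are unknown
// (or not meaningful) on the CPU timeline; this matches the "if (width > 0)" gate above.
#include <cstdint>

struct DispatchDimsSketch
{
    uint32_t x;
    uint32_t y;
    uint32_t z;
};

// A zero width marks a draw or an indirect dispatch, so ray history skips the direct-dispatch bookkeeping.
inline bool IsDirectDispatch(const DispatchDimsSketch& dims)
{
    return dims.x > 0;
}
// =====================================================================================================================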
m_pDevice->RayTrace()->GpuRt(deviceIdx)->InitExecuteIndirect(PalCmdBuffer(deviceIdx), initUserData, 1, 1); // Wait for the argument buffer to be populated before continuing with TraceRaysIndirect @@ -11474,7 +11536,10 @@ void CmdBuffer::SetRayTracingPipelineStackSize( // Setup internal constants and descriptors required for shaders using RayQuery void CmdBuffer::BindRayQueryConstants( const Pipeline* pPipeline, - Pal::PipelineBindPoint bindPoint) + Pal::PipelineBindPoint bindPoint, + uint32_t width, + uint32_t height, + uint32_t depth) { if ((pPipeline != nullptr) && pPipeline->HasRayTracing()) { @@ -11494,6 +11559,23 @@ void CmdBuffer::BindRayQueryConstants( VkDevice()->RayTrace()->GetAccelStructTrackerSrd(deviceIdx), sizeof(constants.descriptorTable.accelStructTrackerSrd)); + if (bindPoint == Pal::PipelineBindPoint::Compute) + { + // Ray history dumps for Graphics pipelines are not yet supported + m_pDevice->RayTrace()->TraceDispatch(deviceIdx, + PalCmdBuffer(deviceIdx), + GpuRt::RtPipelineType::Compute, + width, + height, + depth, + 1, + pPipeline->GetApiHash(), + nullptr, + nullptr, + nullptr, + &constants); + } + Pal::ICmdBuffer* pPalCmdBuffer = PalCmdBuffer(deviceIdx); Pal::gpusize constGpuAddr = 0; @@ -11622,7 +11704,8 @@ void CmdBuffer::BindDescriptorBufferEmbeddedSamplers( PerGpuState(deviceIdx)->setBindingData[apiBindPoint][setLayoutInfo.setPtrRegOffset] = static_cast(gpuAddr); - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); SetUserDataPipelineLayout(set, 1, pLayout, palBindPoint, apiBindPoint); } @@ -11642,6 +11725,67 @@ void CmdBuffer::ValidateGraphicsStates() { const uint32_t deviceIdx = deviceGroup.Index(); + if (m_allGpuState.dirtyGraphics.colorBlend) + { + DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); + + RenderStateCache* pRSCache = m_pDevice->GetRenderStateCache(); + + if (pColorBlend == nullptr) + { + DynamicColorBlend colorBlend = {}; + + pRSCache->CreateColorBlendState(m_allGpuState.colorBlendCreateInfo, + m_pDevice->VkInstance()->GetAllocCallbacks(), + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, + colorBlend.pPalColorBlend); + + // Check if pPalColorBlend is already in the m_palColorBlendState; destroy it and use the old one + // if so. The destroy is not expensive since it's just a refCount--. 
+ for (uint32_t i = 0; i < m_palColorBlendState.NumElements(); ++i) + { + const DynamicColorBlend& palColorBlendState = m_palColorBlendState.At(i); + + // Check device0 only should be sufficient + if (palColorBlendState.pPalColorBlend[0] == colorBlend.pPalColorBlend[0]) + { + pRSCache->DestroyColorBlendState(colorBlend.pPalColorBlend, + m_pDevice->VkInstance()->GetAllocCallbacks()); + + pColorBlend = &palColorBlendState; + break; + } + } + + // Add it to the m_palColorBlendState if it doesn't exist + if (pColorBlend == nullptr) + { + m_palColorBlendState.PushBack(colorBlend); + pColorBlend = &m_palColorBlendState.Back(); + } + } + + VK_ASSERT(pColorBlend != nullptr); + + PalCmdBindColorBlendState( + m_pPalCmdBuffers[deviceIdx], + deviceIdx, + pColorBlend->pPalColorBlend[deviceIdx]); + + bool dualSourceBlendEnable = m_pDevice->PalDevice(DefaultDeviceIndex)->CanEnableDualSourceBlend( + m_allGpuState.colorBlendCreateInfo); + + auto pDynamicState = + &m_allGpuState.pipelineState[PipelineBindGraphics].dynamicBindInfo.gfx.dynamicState; + if (dualSourceBlendEnable != pDynamicState->dualSourceBlendEnable) + { + pDynamicState->dualSourceBlendEnable = dualSourceBlendEnable; + m_allGpuState.dirtyGraphics.pipeline = 1; + } + + DbgBarrierPostCmd(DbgBarrierSetDynamicPipelineState); + } + if (m_allGpuState.dirtyGraphics.pipeline) { const GraphicsPipeline* pGraphicsPipeline = m_allGpuState.pGraphicsPipeline; @@ -11795,56 +11939,6 @@ void CmdBuffer::ValidateGraphicsStates() *Device::GetDefaultQuadSamplePattern(m_allGpuState.samplePattern.sampleCount)); } - if (m_allGpuState.dirtyGraphics.colorBlend) - { - DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); - - RenderStateCache* pRSCache = m_pDevice->GetRenderStateCache(); - - if (pColorBlend == nullptr) - { - DynamicColorBlend colorBlend = {}; - - pRSCache->CreateColorBlendState(m_allGpuState.colorBlendCreateInfo, - m_pDevice->VkInstance()->GetAllocCallbacks(), - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, - colorBlend.pPalColorBlend); - - // Check if pPalColorBlend is already in the m_palColorBlendState, destroy it and use the old one - // if yes.The destroy is not expensive since it's just a refCount--. 
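// =====================================================================================================================
// Editor's note: the removed block below is the old home of this color-blend binding; it moves ahead of the
// pipeline-dirty check above so that dual-source blending, now answered by PAL's CanEnableDualSourceBlend() on the
// complete blend create info rather than by scanning individual blend factors (see the IsDualSourceBlend helper
// removed further below), can re-dirty the pipeline before it is validated.
// =====================================================================================================================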
- for (uint32_t i = 0; i < m_palColorBlendState.NumElements(); ++i) - { - const DynamicColorBlend& palColorBlendState = m_palColorBlendState.At(i); - - // Check device0 only should be sufficient - if (palColorBlendState.pPalColorBlend[0] == colorBlend.pPalColorBlend[0]) - { - pRSCache->DestroyColorBlendState(colorBlend.pPalColorBlend, - m_pDevice->VkInstance()->GetAllocCallbacks()); - - pColorBlend = &palColorBlendState; - break; - } - } - - // Add it to the m_palColorBlendState if it doesn't exist - if (pColorBlend == nullptr) - { - m_palColorBlendState.PushBack(colorBlend); - pColorBlend = &m_palColorBlendState.Back(); - } - } - - VK_ASSERT(pColorBlend != nullptr); - - PalCmdBindColorBlendState( - m_pPalCmdBuffers[deviceIdx], - deviceIdx, - pColorBlend->pPalColorBlend[deviceIdx]); - - DbgBarrierPostCmd(DbgBarrierSetDynamicPipelineState); - } - if (m_allGpuState.dirtyGraphics.msaa) { DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); @@ -11916,7 +12010,7 @@ void CmdBuffer::ValidateSamplePattern( { const Pal::MsaaQuadSamplePattern* pLocations; - if (pSamplePattern != nullptr && (pSamplePattern->sampleCount > 0)) + if ((pSamplePattern != nullptr) && (pSamplePattern->sampleCount > 0)) { VK_ASSERT(sampleCount == pSamplePattern->sampleCount); @@ -12202,28 +12296,6 @@ void CmdBuffer::SetColorBlendEnable( } } -// ===================================================================================================================== -// Returns true if the given VkBlendFactor factor is a dual source blend factor -static bool IsDualSourceBlend( - VkBlendFactor blend) -{ - bool needDualSource = false; - switch (blend) - { - case VK_BLEND_FACTOR_SRC1_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: - case VK_BLEND_FACTOR_SRC1_ALPHA: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: - needDualSource = true; - break; - default: - needDualSource = false; - break; - } - - return needDualSource; -} - // ===================================================================================================================== void CmdBuffer::SetColorBlendEquation( uint32_t firstAttachment, @@ -12232,8 +12304,6 @@ void CmdBuffer::SetColorBlendEquation( { uint32_t lastAttachment = Util::Min(firstAttachment + attachmentCount, Pal::MaxColorTargets); - bool dualSourceBlendEnable = false; - for (uint32_t i = firstAttachment; i < lastAttachment; i++) { const VkColorBlendEquationEXT& colorBlendEquation = pColorBlendEquations[i - firstAttachment]; @@ -12252,7 +12322,6 @@ void CmdBuffer::SetColorBlendEquation( (pTarget->srcBlendAlpha != srcBlendAlpha) || (pTarget->dstBlendAlpha != dstBlendAlpha) || (pTarget->blendFuncAlpha != blendFuncAlpha)) - { pTarget->srcBlendColor = srcBlendColor; pTarget->dstBlendColor = dstBlendColor; @@ -12261,22 +12330,6 @@ void CmdBuffer::SetColorBlendEquation( pTarget->dstBlendAlpha = dstBlendAlpha; pTarget->blendFuncAlpha = blendFuncAlpha; m_allGpuState.dirtyGraphics.colorBlend = 1; - - // Dual source blend only support color attachment 0 - if (i == 0) - { - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.srcColorBlendFactor); - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.dstColorBlendFactor); - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.srcAlphaBlendFactor); - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.dstAlphaBlendFactor); - auto pDynamicState = - &m_allGpuState.pipelineState[PipelineBindGraphics].dynamicBindInfo.gfx.dynamicState; - if (dualSourceBlendEnable != pDynamicState->dualSourceBlendEnable) - { - 
pDynamicState->dualSourceBlendEnable = dualSourceBlendEnable; - m_allGpuState.dirtyGraphics.pipeline = 1; - } - } } } } @@ -12585,7 +12638,8 @@ void CmdBuffer::SetDepthClipNegativeOneToOne( do { PerGpuState(deviceGroup.Index())->viewport.depthRange = depthRange; - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); m_allGpuState.dirtyGraphics.viewport = 1; m_allGpuState.staticTokens.viewports = DynamicRenderStateToken; diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index 2d7f4b04..5419d85a 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -185,6 +185,8 @@ VkResult ComputePipeline::Create( uint64_t apiPsoHash = {}; BuildApiHash(pCreateInfo, shaderInfo, &elfHash, &apiPsoHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; + const VkPipelineCreationFeedbackCreateInfoEXT* pPipelineCreationFeedbackCreateInfo = nullptr; pDefaultCompiler->GetPipelineCreationFeedback(static_cast(pCreateInfo->pNext), &pPipelineCreationFeedbackCreateInfo); @@ -408,6 +410,7 @@ VkResult ComputePipeline::Create( (result == VK_SUCCESS)) { pBinary = PipelineBinaryInfo::Create( + cacheId[DefaultDeviceIndex], pipelineBinarySizes[DefaultDeviceIndex], pPipelineBinaries[DefaultDeviceIndex], pAllocator); @@ -490,6 +493,8 @@ VkResult ComputePipeline::Create( if (result == VK_SUCCESS) { + const Device::DeviceFeatures& deviceFeatures = pDevice->GetEnabledFeatures(); + uint64_t durationTicks = Util::GetPerfCpuTime() - startTimeTicks; uint64_t duration = vk::utils::TicksToNano(durationTicks); @@ -503,7 +508,7 @@ VkResult ComputePipeline::Create( &binaryCreateInfo.pipelineFeedback, &binaryCreateInfo.stageFeedback); - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { size_t numEntries = 0; Util::Vector palSubAllocInfos(pDevice->VkInstance()->Allocator()); @@ -519,6 +524,7 @@ VkResult ComputePipeline::Create( { // Report the Pal suballocation for this pipeline to device_memory_report pDevice->VkInstance()->GetGpuMemoryEventHandler()->ReportDeferredPalSubAlloc( + pDevice, palSubAllocInfos[i].address, palSubAllocInfos[i].offset, ComputePipeline::IntValueFromHandle(*pPipeline), diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index e510b4ff..a6aac8a0 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -146,6 +146,10 @@ #define PalFmt_ASTC(w, h, numfmt) \ PalFmt(Pal::ChNumFormat::AstcLdr##w##x##h##_##numfmt, PalFmtX, PalFmtY, PalFmtZ, PalFmtW) +// For VK_FORMAT_ASTC_{w}x{h}_SFLOAT_BLOCK_EXT: +#define PalFmt_ASTC_HDR(w, h, numfmt) \ + PalFmt(Pal::ChNumFormat::AstcHdr##w##x##h##_##numfmt, PalFmtX, PalFmtY, PalFmtZ, PalFmtW) + // For VK_FORMAT_B{b}G{g}R{r}A{a}_{numfmt}_PACKn: #define PalFmt_BGRA_PACK(b, g, r, a, numfmt) \ PalFmt(Pal::ChNumFormat::X##a##Y##r##Z##g##W##b##_##numfmt, PalFmtY, PalFmtZ, PalFmtW, PalFmtX) @@ -322,6 +326,20 @@ VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x10_UNORM_BLOCK, PalFmt_ASTC(12 VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x10_SRGB_BLOCK, PalFmt_ASTC(12, 10, Srgb)) VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x12_UNORM_BLOCK, PalFmt_ASTC(12, 12, Unorm)) VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x12_SRGB_BLOCK, PalFmt_ASTC(12, 12, Srgb)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_4x4_SFLOAT_BLOCK, PalFmt_ASTC_HDR(4, 4, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_5x4_SFLOAT_BLOCK, PalFmt_ASTC_HDR(5, 4, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_5x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(5, 5, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_6x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(6, 5, Float)) +VK_TO_PAL_STRUC_X( 
FORMAT_ASTC_6x6_SFLOAT_BLOCK, PalFmt_ASTC_HDR(6, 6, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_8x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(8, 5, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_8x6_SFLOAT_BLOCK, PalFmt_ASTC_HDR(8, 6, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_8x8_SFLOAT_BLOCK, PalFmt_ASTC_HDR(8, 8, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 5, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x6_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 6, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x8_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 8, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x10_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 10, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x10_SFLOAT_BLOCK, PalFmt_ASTC_HDR(12, 10, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x12_SFLOAT_BLOCK, PalFmt_ASTC_HDR(12, 12, Float)) VK_TO_PAL_STRUC_X( FORMAT_B4G4R4A4_UNORM_PACK16, PalFmt_BGRA_PACK(4, 4, 4, 4, Unorm)) VK_TO_PAL_STRUC_X( FORMAT_B5G5R5A1_UNORM_PACK16, PalFmt_BGRA_PACK(5, 5, 5, 1, Unorm)) VK_TO_PAL_STRUC_X( FORMAT_B5G6R5_UNORM_PACK16, PalFmt_BGR_PACK(5, 6, 5, Unorm)) diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index de9df969..5671c88d 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -159,9 +159,8 @@ VkResult DescriptorPool::Init( if (m_pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead) { - m_addresses[deviceIdx].fmaskCpuAddr = static_cast(m_pHostOnlyMemory); - static_cast(Util::VoidPtrInc(m_pHostOnlyMemory, - memReqs.size * numPalDevices + memReqs.size * deviceIdx)); + m_addresses[deviceIdx].fmaskCpuAddr = static_cast(Util::VoidPtrInc(m_pHostOnlyMemory, + memReqs.size * numPalDevices + memReqs.size * deviceIdx)); } } diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index f7a6bbc8..4b589b91 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -795,18 +795,29 @@ void DescriptorUpdate::CopyDescriptorSets( VK_ASSERT(destBinding.sta.dwArrayStride > 0); VK_ASSERT(srcBinding.sta.dwArrayStride > 0); uint32_t* pSrcAddr = pSrcSet->StaticCpuAddress(deviceIdx) + srcBinding.sta.dwOffset - + params.srcArrayElement * srcBinding.sta.dwArrayStride * sizeof(uint32_t); + + params.srcArrayElement * srcBinding.sta.dwArrayStride; uint32_t* pDestAddr = pDestSet->StaticCpuAddress(deviceIdx) + destBinding.sta.dwOffset - + params.dstArrayElement * destBinding.sta.dwArrayStride * sizeof(uint32_t); + + params.dstArrayElement * destBinding.sta.dwArrayStride; - for (uint32_t j = 0; j < count; ++j) + if (srcBinding.sta.dwArrayStride == destBinding.sta.dwArrayStride) + { + // Source and destination have the same memory layout of array elements. 
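+            // (Editor's note: dwOffset and dwArrayStride are dword counts, so element pointers advance by
+            // element * stride on the uint32_t* directly; the removed code below also multiplied the element offset
+            // by sizeof(uint32_t), overshooting the source and destination addresses. Equal strides let the whole
+            // array be copied with the single memcpy that follows.)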
+ memcpy(pDestAddr, pSrcAddr, srcBinding.sta.dwArrayStride * sizeof(uint32_t) * count); + } + else { - uint32_t dstSizeInDw = destBinding.sta.dwArrayStride; - uint32_t srcSizeInDw = srcBinding.sta.dwArrayStride; - memcpy(pDestAddr + j * dstSizeInDw, pSrcAddr + j * srcSizeInDw, - Util::Min(destBinding.sta.dwArrayStride * sizeof(uint32_t), - srcBinding.sta.dwArrayStride * sizeof(uint32_t))); + const auto arrayElementSize = Util::Min( + destBinding.sta.dwArrayStride * sizeof(uint32_t), + srcBinding.sta.dwArrayStride * sizeof(uint32_t)); + + for (uint32_t j = 0; j < count; ++j) + { + memcpy( + pDestAddr + j * destBinding.sta.dwArrayStride, + pSrcAddr + j * srcBinding.sta.dwArrayStride, + arrayElementSize); + } } } else if ((srcBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) || diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 9ed730d4..9d9e0454 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -376,14 +376,6 @@ VkResult Device::Create( enabledDeviceExtensions.IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1) == false); } - if (enabledDeviceExtensions.IsExtensionEnabled(DeviceExtensions::EXT_EXTENDED_DYNAMIC_STATE3)) - { - if (pPhysicalDevice->GetRuntimeSettings().dynamicPrimitiveTopologyUnrestricted) - { - deviceFeatures.dynamicPrimitiveTopologyUnrestricted = true; - } - } - uint32_t numDevices = 1; PhysicalDevice* pPhysicalDevices[MaxPalDevices] = { pPhysicalDevice }; Pal::IDevice* pPalDevices[MaxPalDevices] = { pPhysicalDevice->PalDevice() }; @@ -641,6 +633,30 @@ VkResult Device::Create( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ADDRESS_BINDING_REPORT_FEATURES_EXT: + { + const VkPhysicalDeviceAddressBindingReportFeaturesEXT* pAddressBindingReportFeaturesEXT = + reinterpret_cast(pHeader); + + if (pAddressBindingReportFeaturesEXT->reportAddressBinding) + { + deviceFeatures.deviceAddressBindingReport = true; + deviceFeatures.gpuMemoryEventHandler = true; + + uint32 enabledCallbacks = pInstance->PalPlatform()->GetEnabledCallbackTypes(); + + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::AllocGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::FreeGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::SubAllocGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::SubFreeGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::BindGpuMemory); + + pInstance->PalPlatform()->SetEnabledCallbackTypes(enabledCallbacks); + } + + break; + } + default: break; } @@ -689,6 +705,15 @@ VkResult Device::Create( deviceFeatures.mustWriteImmutableSamplers = false; } + if (enabledDeviceExtensions.IsExtensionEnabled(DeviceExtensions::EXT_EXTENDED_DYNAMIC_STATE3)) + { + if (pPhysicalDevice->GetRuntimeSettings().dynamicPrimitiveTopologyUnrestricted) + { + deviceFeatures.dynamicPrimitiveTopologyUnrestricted = true; + deviceFeatures.assumeDynamicTopologyInLibs = deviceFeatures.graphicsPipelineLibrary; + } + } + if ((pPhysicalDevice->GetRuntimeSettings().strictImageSizeRequirements == StrictImageSizeOn) || ((pPhysicalDevice->GetRuntimeSettings().strictImageSizeRequirements == StrictImageSizeAppControlled) && maintenance4Enabled)) @@ -804,9 +829,8 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_DEVICE_DEVICE_MEMORY_REPORT_CREATE_INFO_EXT: { - deviceFeatures.deviceMemoryReport = true; - - pInstance->GetGpuMemoryEventHandler()->EnableGpuMemoryEvents(); + deviceFeatures.deviceMemoryReport = true; + 
deviceFeatures.gpuMemoryEventHandler = true; uint32 enabledCallbacks = pInstance->PalPlatform()->GetEnabledCallbackTypes(); @@ -878,8 +902,7 @@ VkResult Device::Create( if (vkResult == VK_SUCCESS) { pMemory = pInstance->AllocMem( - privateDataSize + apiDeviceSize - , + privateDataSize + apiDeviceSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -918,6 +941,11 @@ VkResult Device::Create( auto gpuMemoryEventHandler = pInstance->GetGpuMemoryEventHandler(); Device* pDevice = ApiDevice::ObjectFromHandle(reinterpret_cast(pDispatchableDevice)); + if (deviceFeatures.gpuMemoryEventHandler) + { + gpuMemoryEventHandler->EnableGpuMemoryEvents(pDevice); + } + for (auto iter = deviceMemoryReportCallbacks.Begin(); iter.IsValid(); iter.Next()) { iter.Get().pDevice = pDevice; @@ -997,12 +1025,6 @@ VkResult Device::Create( return vkResult; } -// ===================================================================================================================== - -// ==================================================================================================================== = - -// ===================================================================================================================== - #if VKI_RAY_TRACING // ===================================================================================================================== VkResult Device::CreateRayTraceState() @@ -1685,14 +1707,16 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) m_renderStateCache.Destroy(); - const bool deviceMemoryReportEnabled = m_enabledFeatures.deviceMemoryReport; - Util::Destructor(this); - if (deviceMemoryReportEnabled == true) + if (m_enabledFeatures.deviceMemoryReport) { VkInstance()->GetGpuMemoryEventHandler()->UnregisterDeviceMemoryReportCallbacks(this); - VkInstance()->GetGpuMemoryEventHandler()->DisableGpuMemoryEvents(); + } + + if (m_enabledFeatures.gpuMemoryEventHandler) + { + VkInstance()->GetGpuMemoryEventHandler()->DisableGpuMemoryEvents(this); } FreeApiObject(VkInstance()->GetAllocCallbacks(), ApiDevice::FromObject(this)); @@ -1744,7 +1768,7 @@ VkResult Device::CreateInternalComputePipeline( const uint8_t* pCode, uint32_t numUserDataNodes, Vkgc::ResourceMappingRootNode* pUserDataNodes, - VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, bool forceWave64, const VkSpecializationInfo* pSpecializationInfo, InternalPipeline* pInternalPipeline) @@ -1775,7 +1799,8 @@ VkResult Device::CreateInternalComputePipeline( // Build shader module result = pCompiler->BuildShaderModule( this, - flags, + 0, + internalShaderFlags, codeByteSize, pCode, false, @@ -1927,7 +1952,7 @@ VkResult Device::CreateInternalComputePipeline( } memcpy(pInternalPipeline->pPipeline, pPipeline, sizeof(pPipeline)); - if (GetEnabledFeatures().deviceMemoryReport == true) + if (GetEnabledFeatures().gpuMemoryEventHandler) { size_t numEntries = 0; Util::Vector palSubAllocInfos(VkInstance()->Allocator()); @@ -1943,6 +1968,7 @@ VkResult Device::CreateInternalComputePipeline( // Report the Pal suballocation for this pipeline to device_memory_report // Internal pipelines are attributed to the device VkInstance()->GetGpuMemoryEventHandler()->ReportDeferredPalSubAlloc( + this, palSubAllocInfos[i].address, palSubAllocInfos[i].offset, DispatchableDevice::IntValueFromHandle(DispatchableDevice::FromObject(this)), @@ -2203,23 +2229,21 @@ void Device::GetQueue2( uint32 queueCount = VkPhysicalDevice(DefaultDeviceIndex)->GetQueueFamilyProperties(queueFamilyIndex).queueCount; + // Queues 
with flags will be indexed separately from queues without flags + // Consider only those queues with matching flags + uint32 testIndex = 0; + for (uint32 i = 0; i < queueCount; i++) { - // Queues with flags will be indexed separately to queues without flags - // Consider only those queues with matching flags - uint32 testIndex = 0; - for (uint32 i = 0; i < queueCount; i++) - { - DispatchableQueue* pFoundQueue = m_pQueues[queueFamilyIndex][i]; + DispatchableQueue* pFoundQueue = m_pQueues[queueFamilyIndex][i]; - if ((pFoundQueue != nullptr) && ((*pFoundQueue)->GetFlags() == flags)) + if ((pFoundQueue != nullptr) && ((*pFoundQueue)->GetFlags() == flags)) + { + if (testIndex == queueIndex) { - if (testIndex == queueIndex) - { - *pQueue = reinterpret_cast(pFoundQueue); - break; - } - testIndex++; + *pQueue = reinterpret_cast(pFoundQueue); + break; } + testIndex++; } } } @@ -2237,8 +2261,6 @@ Pal::PrtFeatureFlags Device::GetPrtFeatures() const return featureFlags; } -// ===================================================================================================================== - // ===================================================================================================================== VkResult Device::WaitForFences( uint32_t fenceCount, @@ -4835,7 +4857,6 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetRayTracingCaptureReplayShaderGroupHandlesKHR // replaying and we will make use of them in vkCreateRayTracingPipelinesKHR. RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pipeline); - // #raytracing: MGPU support - Return based on DefaultDeviceIndex since the result shouldn't vary between GPUs. pPipeline->GetRayTracingShaderGroupHandles(DefaultDeviceIndex, firstGroup, groupCount, dataSize, pData); return VK_SUCCESS; diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 2400b9bc..cf59ef01 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -806,6 +806,7 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetShaderModuleIdentifierEXT ); INIT_DISPATCH_ENTRY(vkGetShaderModuleCreateInfoIdentifierEXT ); + } // ===================================================================================================================== diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 276a42ca..c4792a75 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -448,6 +448,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( (result == VK_SUCCESS)) { pBinaryInfo = PipelineBinaryInfo::Create( + pCacheIds[DefaultDeviceIndex], pPipelineBinarySizes[DefaultDeviceIndex], pPipelineBinaries[DefaultDeviceIndex], pAllocator); @@ -582,6 +583,8 @@ VkResult GraphicsPipeline::Create( Util::MetroHash::Hash elfHash = {}; BuildApiHash(pCreateInfo, &apiPsoHash, &elfHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; + // 4. 
Get pipeline layout VK_ASSERT(pCreateInfo->layout != VK_NULL_HANDLE); PipelineLayout* pPipelineLayout = PipelineLayout::ObjectFromHandle(pCreateInfo->layout); @@ -699,6 +702,8 @@ VkResult GraphicsPipeline::Create( if (result == VK_SUCCESS) { + const Device::DeviceFeatures& deviceFeatures = pDevice->GetEnabledFeatures(); + uint64_t durationTicks = Util::GetPerfCpuTime() - startTimeTicks; uint64_t duration = vk::utils::TicksToNano(durationTicks); binaryCreateInfo.pipelineFeedback.feedbackValid = true; @@ -710,7 +715,7 @@ VkResult GraphicsPipeline::Create( &binaryCreateInfo.pipelineFeedback, binaryCreateInfo.stageFeedback); - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { size_t numEntries = 0; Util::Vector palSubAllocInfos(pDevice->VkInstance()->Allocator()); @@ -726,6 +731,7 @@ VkResult GraphicsPipeline::Create( { // Report the Pal suballocation for this pipeline to device_memory_report pDevice->VkInstance()->GetGpuMemoryEventHandler()->ReportDeferredPalSubAlloc( + pDevice, palSubAllocInfos[i].address, palSubAllocInfos[i].offset, GraphicsPipeline::IntValueFromHandle(*pPipeline), @@ -1191,6 +1197,11 @@ GraphicsPipeline::GraphicsPipeline( m_info.graphicsShaderInfos.dynamicState.enable.dualSourceBlendEnable = 1; } + if (ContainsDynamicState(DynamicStatesInternal::VertexInput)) + { + m_info.graphicsShaderInfos.dynamicState.enable.vertexBufferCount = 1; + } + pPalPipelineHasher->Update(m_palPipelineHash); pPalPipelineHasher->Finalize(reinterpret_cast(&m_palPipelineHash)); } diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index e97b567d..7e7d1ded 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -308,6 +308,7 @@ static GraphicsPipelineBinaryCreateInfo* DumpGraphicsPipelineBinaryCreateInfo( // ===================================================================================================================== VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, const GraphicsPipelineLibraryInfo* pLibInfo, const GraphicsPipelineShaderStageInfo* pShaderStageInfo, @@ -317,7 +318,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( { VkResult result = VK_SUCCESS; PipelineCompiler* pCompiler = pDevice->GetCompiler(DefaultDeviceIndex); - uint32_t dynamicStateFlags = GetDynamicStateFlags(pCreateInfo->pDynamicState, pLibInfo); + uint64_t dynamicStateFlags = GetDynamicStateFlags(pCreateInfo->pDynamicState, pLibInfo); // Pipeline info only includes the shaders that match the enabled VkGraphicsPipelineLibraryFlagBitsEXT. // Use this information to skip the compilation of unused shader modules. @@ -348,7 +349,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( { // We don't take care of the result. 
Early compile failure in some cases is expected result = pCompiler->CreateGraphicsShaderBinary( - pDevice, pShaderStageInfo->stages[i].stage, pBinaryCreateInfo, &pTempModules[i]); + pDevice, pPipelineCache, pShaderStageInfo->stages[i].stage, pBinaryCreateInfo, &pTempModules[i]); } pTempModuleStages[i].stage = pShaderStageInfo->stages[i].stage; @@ -375,7 +376,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pTempModules[TempIdx] = *pParentHandle; result = pCompiler->CreateGraphicsShaderBinary( - pDevice, ShaderStage::ShaderStageVertex, pBinaryCreateInfo, &pTempModules[TempIdx]); + pDevice, pPipelineCache, ShaderStage::ShaderStageVertex, pBinaryCreateInfo, &pTempModules[TempIdx]); pTempModuleStages[TempIdx].stage = ShaderStage::ShaderStageVertex; pTempModuleStages[TempIdx].freeBinaryOnly = true; @@ -396,8 +397,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pTempModules[TempIdx] = *pParentHandle; - result = pCompiler->CreateGraphicsShaderBinary( - pDevice, ShaderStage::ShaderStageFragment, pBinaryCreateInfo, &pTempModules[TempIdx]); + result = pCompiler->CreateGraphicsShaderBinary(pDevice, pPipelineCache, + ShaderStage::ShaderStageFragment, pBinaryCreateInfo, &pTempModules[TempIdx]); pTempModuleStages[TempIdx].stage = ShaderStage::ShaderStageFragment; pTempModuleStages[TempIdx].freeBinaryOnly = true; @@ -407,12 +408,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; ++i) { - if (pCompiler->IsValidShaderModule(&pTempModules[i])) - { - PipelineCompiler::SetPartialGraphicsPipelineBinaryInfo( - &pTempModules[i], pTempModuleStages[i].stage, pBinaryCreateInfo); - } - else + if (pCompiler->IsValidShaderModule(&pTempModules[i]) == false) { pTempModuleStages[i].stage = ShaderStage::ShaderStageInvalid; } @@ -443,6 +439,8 @@ VkResult GraphicsPipelineLibrary::Create( ShaderModuleHandle tempModules[ShaderStage::ShaderStageGfxCount] = {}; TempModuleState tempModuleStates[ShaderStage::ShaderStageGfxCount] = {}; + binaryCreateInfo.pipelineInfo.iaState.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + // 1. Build shader stage infos if (result == VK_SUCCESS) { @@ -481,6 +479,7 @@ VkResult GraphicsPipelineLibrary::Create( uint64_t apiPsoHash = {}; Util::MetroHash::Hash elfHash = {}; BuildApiHash(pCreateInfo, &apiPsoHash, &elfHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; // 4. Get pipeline layout const PipelineLayout* pPipelineLayout = PipelineLayout::ObjectFromHandle(pCreateInfo->layout); @@ -510,6 +509,7 @@ VkResult GraphicsPipelineLibrary::Create( // 6. Create partial pipeline binary for fast-link result = CreatePartialPipelineBinary( pDevice, + pPipelineCache, pCreateInfo, &libInfo, &shaderStageInfo, diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 06d144f7..c8a4635d 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -1789,6 +1789,9 @@ void Image::CalculateMemoryRequirementsInternal( ~pDevice->GetMemoryTypeMaskMatching(VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD); } + // Images do not use the memory types reserved for descriptor buffers + pMemoryRequirements->memoryTypeBits &= ~pDevice->GetMemoryTypeMaskForDescriptorBuffers(); + // Add an extra memory padding. This can be enabled while capturing GFXR traces and disabled later. Capturing with // this setting enabled helps in replaying GFXR traces. When this setting is not used while capture, GFXR might // return a fatal error while replaying with different DCC threshold values.
This is caused because gfxreconstruct diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index 0cf27269..d7e17eab 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -403,6 +403,8 @@ VkResult Instance::Init( if (status == VK_SUCCESS) { + Pal::IPlatform::InstallDeveloperCb(m_pPalPlatform, &Instance::PalDeveloperCallback, this); + // Get the platform property. Vulkan doesn't use it so far. Pal::PlatformProperties platformProps; @@ -477,13 +479,6 @@ VkResult Instance::Init( } } - // Install PAL developer callback if the SQTT layer is enabled. This is required to trap internal barriers - // and dispatches performed by PAL so that they can be correctly annotated to RGP. - if (status == VK_SUCCESS) - { - Pal::IPlatform::InstallDeveloperCb(m_pPalPlatform, &Instance::PalDeveloperCallback, this); - } - if (status == VK_SUCCESS) { size_t screenSize = m_pPalPlatform->GetScreenObjectSize(); @@ -567,6 +562,10 @@ VkResult Instance::Init( InitDispatchTable(); #if DEBUG + // Optionally wait for a debugger to be attached + utils::WaitIdleForDebugger(pPhysicalDevice->GetRuntimeSettings().waitForDebugger, + &pPhysicalDevice->GetRuntimeSettings().waitForDebuggerExecutableName[0], + pPhysicalDevice->GetRuntimeSettings().debugTimeout); #endif } @@ -1157,6 +1156,8 @@ void PAL_STDCALL Instance::PalDeveloperCallback( if (pInstance->IsTracingSupportEnabled()) { + // This is required to trap internal barriers and dispatches performed by PAL so that they can be correctly + // annotated to RGP. SqttMgr::PalDeveloperCallback(pInstance, deviceIndex, type, pCbData); } diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 1c511d04..a713a1d9 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -366,6 +366,8 @@ VkResult Memory::Create( } } + const Device::DeviceFeatures& deviceFeatures = pDevice->GetEnabledFeatures(); + if (vkResult == VK_SUCCESS) { // Account for committed size in logical device. The destructor will decrease the counter accordingly. 
@@ -399,9 +401,10 @@ VkResult Memory::Create( if (pPalGpuMem != nullptr) { - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanAllocateEvent( + pDevice, pPalGpuMem, Memory::IntValueFromHandle(*pMemoryHandle), VK_OBJECT_TYPE_DEVICE_MEMORY, @@ -431,9 +434,10 @@ VkResult Memory::Create( } else { - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.deviceMemoryReport) { pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanAllocationFailedEvent( + pDevice, pAllocInfo->allocationSize, VK_OBJECT_TYPE_DEVICE_MEMORY, pAllocInfo->memoryTypeIndex); @@ -976,11 +980,6 @@ void Memory::Free( Pal::IGpuMemory* pGpuMemory = m_pPalMemory[i][i]; if (pGpuMemory != nullptr) { - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) - { - pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanFreeEvent(pGpuMemory); - } - Pal::IDevice* pPalDevice = pDevice->PalDevice(i); pDevice->RemoveMemReference(pPalDevice, pGpuMemory); diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index a77e55ff..49a2b7ba 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -297,6 +297,31 @@ static bool VerifyAstcLdrFormatSupport( VerifyFormatSupport(dev, VK_FORMAT_ASTC_12x12_SRGB_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); return astcLdrSupport; } +// ===================================================================================================================== +// Returns true if the given physical device supports the minimum required compressed texture formats to report ASTC-HDR +// support +static VkBool32 VerifyAstcHdrFormatSupport( + const PhysicalDevice& dev) +{ + // Based on vulkan spec Table 68. 
ASTC HDR compressed formats with VkImageType + // VK_IMAGE_TYPE_2D + const VkBool32 astcHdrSupport = + VerifyFormatSupport(dev, VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_6x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_6x6_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_8x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_8x6_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_8x8_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x6_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x8_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x10_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_12x10_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_12x12_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + return astcHdrSupport; +} // ===================================================================================================================== // Returns true if the given physical device supports the minimum required BC compressed texture format @@ -633,7 +658,7 @@ static void GenerateCacheUuid( uint32 vulkanIcdVersion; uint32 palInterfaceVersion; uint32 osHash; - uint32 buildTimeHas; + uint32 buildTimeHash; } cacheVersionInfo = { Util::HashLiteralString("pipelineCache"), @@ -1010,7 +1035,9 @@ VkResult PhysicalDevice::Initialize() m_memoryTypeMaskForExternalSharing = m_memoryTypeMask; - if (result == Pal::Result::Success) + VkResult vkResult = PalToVkResult(result); + + if (vkResult == VK_SUCCESS) { // Determine if EQAA is supported by checking if, for each MSAA fragment count, all sample combos are okay. const auto& imgProps = PalProperties().imageProperties; @@ -1033,26 +1060,15 @@ VkResult PhysicalDevice::Initialize() m_eqaaSupported &= Util::TestAllFlagsSet(imgProps.msaaSupport, Pal::MsaaFlags::MsaaAllF1); break; } - } - // Generate our cache UUID. - // This can be use later as a "namespace" for Uuid3()/Uuid5() calls for individual pipelines - if (result == Pal::Result::Success) - { + // Generate our cache UUID. 
+ // This can be used later as a "namespace" for Uuid3()/Uuid5() calls for individual pipelines GenerateCacheUuid(settings, PalProperties(), m_appProfile, &m_pipelineCacheUUID); - } - // Collect properties for perf experiments (this call can fail; we just don't report support for - // perf measurement extension then) - if (result == Pal::Result::Success) - { + // Collect properties for perf experiments (this call can fail; we just don't report support for + // perf measurement extension then) PopulateGpaProperties(); - } - VkResult vkResult = PalToVkResult(result); - - if (vkResult == VK_SUCCESS) - { InitializePlatformKey(settings); vkResult = m_compiler.Initialize(); } @@ -2505,72 +2521,48 @@ void PhysicalDevice::PopulateLimits() // Maximum number of components of output variables which may be output by a vertex shader. m_limits.maxVertexOutputComponents = 128; - // OGL: SI_MAX_VP_VARYING_COMPONENTS - // Maximum tessellation generation level supported by the fixed function tessellation primitive generator. m_limits.maxTessellationGenerationLevel = 64; - // OGL: SI_MAX_TESS_FACTOR - // Maximum patch size, in vertices, of patches that can be processed by the tessellation primitive generator. // This is specified by the patchControlPoints of the VkPipelineTessellationStateCreateInfo structure. m_limits.maxTessellationPatchSize = 32; - // OGL: pHpCaps->maxVertexCountPerPatch = SI_MAX_VERTEX_COUNT_PER_PATCH; - // Maximum number of components of input variables which may be provided as per-vertex inputs to the tessellation // control shader stage. m_limits.maxTessellationControlPerVertexInputComponents = 128; - // OGL: pHpCaps->maxTessControlInputComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS; - // Maximum number of components of per-vertex output variables which may be output from the tessellation control // shader stage. m_limits.maxTessellationControlPerVertexOutputComponents = 128; - // OGL: pHpCaps->maxHullVaryingComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS; - // Maximum number of components of per-patch output variables which may be output from the tessellation control // shader stage. m_limits.maxTessellationControlPerPatchOutputComponents = 120; - // OGL: pHpCaps->maxTessControlPatchComponents = SI_MAX_TESS_CONTROL_PATCH_COMPONENTS; - // Maximum total number of components of per-vertex and per-patch output variables which may be output from the // tessellation control shader stage. (The total number of components of active per-vertex and per-patch outputs is // derived by multiplying the per-vertex output component count by the output patch size and then adding the // per-patch output component count. The total component count may not exceed this limit.) m_limits.maxTessellationControlTotalOutputComponents = 4096; - // OGL: pHpCaps->maxTessControlTotalOutputComponents = SI_MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS; - // Maximum number of components of input variables which may be provided as per-vertex inputs to the tessellation // evaluation shader stage. m_limits.maxTessellationEvaluationInputComponents = 128; - // OGL: pDpCaps->maxTessEvaluationInputComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS [sic] - // Maximum number of components of per-vertex output variables which may be output from the tessellation evaluation // shader stage m_limits.maxTessellationEvaluationOutputComponents = 128; - // OGL: pDpCaps->maxDomainVaryingComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS [sic] - // Maximum invocation count (per input primitive) supported for an instanced geometry shader.
m_limits.maxGeometryShaderInvocations = palProps.gfxipProperties.maxGsInvocations; - // OGL: pGpCaps->maxGeometryInvocations = SI_MAX_GP_INVOCATIONS - // Maximum number of components of input variables which may be provided as inputs to the geometry shader stage m_limits.maxGeometryInputComponents = 128; - // OGL: pGpCaps->maxGeometryVaryingComponents = SI_MAX_GP_VARYING_COMPONENTS - // Maximum number of components of output variables which may be output from the geometry shader stage. m_limits.maxGeometryOutputComponents = 128; - // OGL: pGpCaps->maxGeometryVaryingComponents = SI_MAX_GP_VARYING_COMPONENTS; (NOTE: Not a separate cap) - // Maximum number of vertices which may be emitted by any geometry shader. m_limits.maxGeometryOutputVertices = palProps.gfxipProperties.maxGsOutputVert; @@ -2581,8 +2573,6 @@ void PhysicalDevice::PopulateLimits() // Maximum number of components of input variables which may be provided as inputs to the fragment shader stage. m_limits.maxFragmentInputComponents = 128; - // OGL: pFpCaps->maxFragmentInputComponents = SI_MAX_VP_VARYING_COMPONENTS; - // Maximum number of output attachments which may be written to by the fragment shader stage. m_limits.maxFragmentOutputAttachments = Pal::MaxColorTargets; @@ -2590,8 +2580,6 @@ void PhysicalDevice::PopulateLimits() // enabled and one of the dual source blend modes is in use. m_limits.maxFragmentDualSrcAttachments = 1; - // OGL: pCaps->buf.maxDualSourceDrawBuf = SI_MAX_DUAL_SOURCE_COLOR_BUFFERS; - // NOTE: This could be num_cbs / 2 = 4. When dual source blending is on, two source colors are written per // attachment and to facilitate this the HW operates such that the odd-numbered CBs do not get used. OGL still // reports only 1 dual source attachment though, and I think DX API spec locks you into a single dual source @@ -2616,8 +2604,6 @@ void PhysicalDevice::PopulateLimits() m_limits.maxComputeWorkGroupCount[1] = 65535; m_limits.maxComputeWorkGroupCount[2] = 65535; - // OGL: pCpCaps->maxComputeWorkGroupCount[i] = SI_MAX_WORK_GROUP_COUNT; - const uint32_t clampedMaxThreads = Util::Min(palProps.gfxipProperties.maxThreadGroupSize, palProps.gfxipProperties.maxAsyncComputeThreadGroupSize); @@ -3827,6 +3813,9 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_QUEUE_FAMILY_FOREIGN)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_DESCRIPTOR_INDEXING)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(VALVE_MUTABLE_DESCRIPTOR_TYPE)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MUTABLE_DESCRIPTOR_TYPE)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_VARIABLE_POINTERS)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_VERTEX_ATTRIBUTE_DIVISOR)); @@ -4107,6 +4096,12 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_PHYSICAL_DEVICE_DRM)); #endif + if ((pPhysicalDevice == nullptr) || + VerifyAstcHdrFormatSupport(*pPhysicalDevice)) + { + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_TEXTURE_COMPRESSION_ASTC_HDR)); + } + return availableExtensions; } @@ -5940,7 +5935,7 @@ size_t PhysicalDevice::GetFeatures2( pExtInfo->subgroupSizeControl = VK_TRUE; pExtInfo->computeFullSubgroups = VK_TRUE; pExtInfo->synchronization2 = VK_TRUE; - pExtInfo->textureCompressionASTC_HDR = VK_FALSE; + pExtInfo->textureCompressionASTC_HDR = VerifyAstcHdrFormatSupport(*this); pExtInfo->shaderZeroInitializeWorkgroupMemory = VK_TRUE; 
pExtInfo->dynamicRendering = VK_TRUE; pExtInfo->shaderIntegerDotProduct = VK_TRUE; @@ -6551,7 +6546,7 @@ size_t PhysicalDevice::GetFeatures2( if (updateFeatures) { - pExtInfo->textureCompressionASTC_HDR = VK_FALSE; + pExtInfo->textureCompressionASTC_HDR = VerifyAstcHdrFormatSupport(*this); } structSize = sizeof(*pExtInfo); @@ -6593,6 +6588,19 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ADDRESS_BINDING_REPORT_FEATURES_EXT: + { + auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceAddressBindingReportFeaturesEXT*>(pHeader); + + if (updateFeatures) + { + pExtInfo->reportAddressBinding = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FAULT_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceFaultFeaturesEXT*>(pHeader); diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index cce9e3e0..5e2be2f2 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -353,6 +353,7 @@ VkResult Pipeline::BuildShaderStageInfo( result = pCompiler->BuildShaderModule( pDevice, flags, + 0, codeSize, pCode, adaptForFastLink, @@ -699,6 +700,7 @@ VkResult Pipeline::GetShaderDisassembly( // ===================================================================================================================== PipelineBinaryInfo* PipelineBinaryInfo::Create( + Util::MetroHash::Hash hash, size_t size, const void* pBinary, const VkAllocationCallbacks* pAllocator) @@ -717,9 +719,9 @@ PipelineBinaryInfo* PipelineBinaryInfo::Create( { pInfo = VK_PLACEMENT_NEW(pStorage) PipelineBinaryInfo(); + pInfo->binaryHash = hash; pInfo->binaryByteSize = size; pInfo->pBinary = Util::VoidPtrInc(pStorage, sizeof(PipelineBinaryInfo)); - memcpy(pInfo->pBinary, pBinary, size); } } @@ -887,11 +889,12 @@ void Pipeline::ElfHashToCacheId( hasher.Update(pDevice->GetEnabledFeatures().nullDescriptorExtended); #if VKI_RAY_TRACING - // The AccelStructTracker enable status gets stored inside the ELF within - // the static GpuRT flags. Needed for both TraceRay() and RayQuery(). if (pDevice->RayTrace() != nullptr) { + // The accel struct tracker enable and the trace ray counter states get stored inside the ELF within + // the static GpuRT flags. Needed for both TraceRay() and RayQuery().
hasher.Update(pDevice->RayTrace()->AccelStructTrackerEnabled(deviceIdx)); + hasher.Update(pDevice->RayTrace()->TraceRayCounterMode(deviceIdx)); } #endif diff --git a/icd/api/vk_pipeline_cache.cpp b/icd/api/vk_pipeline_cache.cpp index 031ac318..0f9f38e6 100644 --- a/icd/api/vk_pipeline_cache.cpp +++ b/icd/api/vk_pipeline_cache.cpp @@ -39,28 +39,17 @@ namespace vk // ===================================================================================================================== PipelineCache::PipelineCache( const Device* pDevice, - ShaderCache* pShaderCaches, PipelineBinaryCache* pBinaryCache ) : m_pDevice(pDevice), - m_shaderCaches{}, m_pBinaryCache(pBinaryCache) { - for (uint32_t i = 0; i < pDevice->NumPalDevices(); ++i) - { - const auto& cache = pShaderCaches[i]; - m_shaderCaches[i].Init(cache.GetCacheType(), cache.GetCachePtr()); - } } // ===================================================================================================================== PipelineCache::~PipelineCache() { - for (uint32_t i = 0; i < m_pDevice->NumPalDevices(); i++) - { - m_shaderCaches[i].Destroy(m_pDevice->GetCompiler(i)); - } } // ===================================================================================================================== @@ -73,20 +62,9 @@ VkResult PipelineCache::Create( VkResult result = VK_SUCCESS; const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); uint32_t numPalDevices = pDevice->NumPalDevices(); - bool useInitialData = false; - size_t shaderCacheSize = 0; - size_t pipelineCacheSize[MaxPalDevices]; bool usePipelineCacheInitialData = false; - PipelineCompilerType cacheType = pDevice->GetCompiler(DefaultDeviceIndex)->GetShaderCacheType(); - - for (uint32_t i = 0; i < numPalDevices; i++) - { - pipelineCacheSize[i] = pDevice->GetCompiler(DefaultDeviceIndex)->GetShaderCacheSize(cacheType); - shaderCacheSize += pipelineCacheSize[i]; - } - if ((pCreateInfo->initialDataSize > 0) && settings.usePipelineCacheInitialData) { const PipelineCacheHeaderData* pHeader = static_cast(pCreateInfo->pInitialData); @@ -113,21 +91,13 @@ VkResult PipelineCache::Create( { usePipelineCacheInitialData = true; } - else - { - auto pPrivateDataHeader = reinterpret_cast(pData); - if (pPrivateDataHeader->cacheType == cacheType) - { - useInitialData = true; - } - } } } } } // Allocate system memory for all objects - const size_t objSize = sizeof(PipelineCache) + shaderCacheSize; + const size_t objSize = sizeof(PipelineCache); void* pMemory = pDevice->AllocApiObject(pAllocator, objSize); if (pMemory == nullptr) @@ -136,102 +106,40 @@ VkResult PipelineCache::Create( } else { - const PipelineCachePrivateHeaderData* pPrivateDataHeader = nullptr; - const void* pBlobs[MaxPalDevices] = {}; - - if (useInitialData) - { - pPrivateDataHeader = reinterpret_cast( - Util::VoidPtrInc(pCreateInfo->pInitialData, sizeof(PipelineCacheHeaderData))); - - pBlobs[0] = Util::VoidPtrInc(pPrivateDataHeader, sizeof(PipelineCachePrivateHeaderData)); - for (uint32_t i = 1; i < numPalDevices; i++) - { - pBlobs[i] = Util::VoidPtrInc(pBlobs[i - 1], static_cast(pPrivateDataHeader->blobSize[i - 1])); - } - } - - ShaderCache shaderCaches[MaxPalDevices]; - size_t shaderCacheOffset = sizeof(PipelineCache); uint32_t expectedEntries = pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPipelineCacheExpectedEntryCount(); - for (uint32_t i = 0; i < numPalDevices; i++) + PipelineBinaryCache* pBinaryCache = nullptr; + if (settings.allowExternalPipelineCacheObject) { const void* pInitialData = nullptr; size_t initialDataSize = 0; - if 
(useInitialData) + if (usePipelineCacheInitialData) { - pInitialData = pBlobs[i]; - initialDataSize = static_cast(pPrivateDataHeader->blobSize[i]); + pInitialData = Util::VoidPtrInc(pCreateInfo->pInitialData, sizeof(PipelineCacheHeaderData)); + initialDataSize = pCreateInfo->initialDataSize - sizeof(PipelineCacheHeaderData); } - if (result == VK_SUCCESS) - { - result = pDevice->GetCompiler(DefaultDeviceIndex)->CreateShaderCache( - pInitialData, - initialDataSize, - expectedEntries, - Util::VoidPtrInc(pMemory, shaderCacheOffset), - &shaderCaches[i]); - } - else - { - break; - } - - // Move to next shader cache object - shaderCacheOffset += pipelineCacheSize[i]; - } - - // Something went wrong with creating the PAL object. Free memory - if (result != VK_SUCCESS) - { - for (uint32_t i = 0; i < numPalDevices; i++) - { - shaderCaches[i].Destroy(pDevice->GetCompiler(i)); - } - } - - if (result == VK_SUCCESS) - { - PipelineBinaryCache* pBinaryCache = nullptr; - if (settings.allowExternalPipelineCacheObject) - { - const void* pInitialData = nullptr; - size_t initialDataSize = 0; - - if (usePipelineCacheInitialData) - { - pInitialData = Util::VoidPtrInc(pCreateInfo->pInitialData, sizeof(PipelineCacheHeaderData)); - initialDataSize = pCreateInfo->initialDataSize - sizeof(PipelineCacheHeaderData); - } - - vk::PhysicalDevice* pDefaultPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); - pBinaryCache = PipelineBinaryCache::Create( - pDefaultPhysicalDevice->VkInstance()->GetAllocCallbacks(), - pDefaultPhysicalDevice->GetPlatformKey(), - pDevice->GetCompiler(DefaultDeviceIndex)->GetGfxIp(), - pDefaultPhysicalDevice->GetRuntimeSettings(), - pDefaultPhysicalDevice->PalDevice()->GetCacheFilePath(), + vk::PhysicalDevice* pDefaultPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + pBinaryCache = PipelineBinaryCache::Create( + pDefaultPhysicalDevice->VkInstance()->GetAllocCallbacks(), + pDefaultPhysicalDevice->GetPlatformKey(), + pDevice->GetCompiler(DefaultDeviceIndex)->GetGfxIp(), + pDefaultPhysicalDevice->GetRuntimeSettings(), + pDefaultPhysicalDevice->PalDevice()->GetCacheFilePath(), #if ICD_GPUOPEN_DEVMODE_BUILD - pDefaultPhysicalDevice->VkInstance()->GetDevModeMgr(), + pDefaultPhysicalDevice->VkInstance()->GetDevModeMgr(), #endif - expectedEntries, - initialDataSize, - pInitialData, - false); + expectedEntries, + initialDataSize, + pInitialData, + false); - // This isn't a terminal failure, the device can continue without the pipeline cache if need be. - VK_ALERT(pBinaryCache == nullptr); - } - PipelineCache* pCache = VK_PLACEMENT_NEW(pMemory) PipelineCache(pDevice, shaderCaches, pBinaryCache); - *pPipelineCache = PipelineCache::HandleFromVoidPointer(pMemory); - } - else - { - pDevice->FreeApiObject(pAllocator, pMemory); + // This isn't a terminal failure, the device can continue without the pipeline cache if need be. + VK_ALERT(pBinaryCache == nullptr); } + PipelineCache* pCache = VK_PLACEMENT_NEW(pMemory) PipelineCache(pDevice, pBinaryCache); + *pPipelineCache = PipelineCache::HandleFromVoidPointer(pMemory); } return result; @@ -257,6 +165,7 @@ VkResult PipelineCache::Destroy( } // ===================================================================================================================== +// Stores AMD specific pipeline cache data to binary cache. 
VkResult PipelineCache::GetData( void* pData, size_t* pSize) @@ -271,43 +180,7 @@ VkResult PipelineCache::GetData( } else { - uint32_t numPalDevices = m_pDevice->NumPalDevices(); - - size_t allBlobSize = sizeof(PipelineCachePrivateHeaderData); - PipelineCachePrivateHeaderData headerData = {}; - - headerData.cacheType = m_shaderCaches[0].GetCacheType(); - for (uint32_t i = 0; i < numPalDevices; i++) - { - size_t blobSize = 0; - result = m_shaderCaches[i].Serialize(nullptr, &blobSize); - VK_ASSERT(result == VK_SUCCESS); - headerData.blobSize[i] = blobSize; - allBlobSize += blobSize; - } - - if (*pSize == 0) - { - *pSize = allBlobSize; - } - else - { - VK_ASSERT(*pSize >= allBlobSize); - memcpy(pData, &headerData, sizeof(headerData)); - - void* pBlob = Util::VoidPtrInc(pData, sizeof(headerData)); - - for (uint32_t i = 0; i < numPalDevices; i++) - { - size_t blobSize = static_cast<size_t>(headerData.blobSize[i]); - result = m_shaderCaches[i].Serialize(pBlob, &blobSize); - if (result != VK_SUCCESS) - { - break; - } - pBlob = Util::VoidPtrInc(pBlob, blobSize); - } - } + *pSize = 0; } return result; @@ -333,34 +206,6 @@ VkResult PipelineCache::Merge( result = m_pBinaryCache->Merge(srcCacheCount, &binaryCaches[0]); } - else - { - Util::AutoBuffer shaderCaches( - srcCacheCount * m_pDevice->NumPalDevices(), - m_pDevice->VkInstance()->Allocator()); - - for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++) - { - for (uint32_t cacheIdx = 0; cacheIdx < srcCacheCount; cacheIdx++) - { - VK_ASSERT(ppSrcCaches[cacheIdx]->GetShaderCache(deviceIdx).GetCacheType() == - GetShaderCache(deviceIdx).GetCacheType()); - // Store all PAL caches like this d0c0,d0c1,d0c2...,d1c0,d1c2,d1c3... - shaderCaches[deviceIdx * srcCacheCount + cacheIdx] = - ppSrcCaches[cacheIdx]->GetShaderCache(deviceIdx).GetCachePtr(); - } - } - - for (uint32_t i = 0; i < m_pDevice->NumPalDevices(); i++) - { - result = m_shaderCaches[i].Merge(srcCacheCount, &shaderCaches[i * srcCacheCount]); - - if (result != VK_SUCCESS) - { - break; - } - } - } return result; } diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp index 5f7dae92..73da28a6 100644 --- a/icd/api/vk_pipeline_layout.cpp +++ b/icd/api/vk_pipeline_layout.cpp @@ -301,6 +301,40 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( const uint32_t pushConstRegCount = pushConstantsSizeInBytes / sizeof(uint32_t); + uint32_t gfxReservedCount = 0; + // Reserve a user-data entry to store the VA of the buffer for transform feedback. + if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)) + { + gfxReservedCount++; + } + + if (pDevice->GetRuntimeSettings().enableDebugPrintf) + { + gfxReservedCount++; + } + +#if VKI_RAY_TRACING + if (HasRayTracing(pIn)) + { + gfxReservedCount += (InternalConstBufferRegCount + MaxTraceRayUserDataRegCount); + } +#endif + + // Reserve the user data entries for the uber-fetch shader const buffer + if (IsUberFetchShaderEnabled(pDevice) && + (settings.enableEarlyCompile == false)) + { + gfxReservedCount += InternalConstBufferRegCount; + } + + // Reserve PAL internal user data nodes for base vertex, base instance, draw id and lds_esgs_size. + gfxReservedCount += 4; + + const uint32_t gfxInlinePushDescriptorUserDataLimit = + (settings.gfxInlinePushDescriptorUserDataLimit > gfxReservedCount) ?
+ settings.gfxInlinePushDescriptorUserDataLimit - gfxReservedCount : + 0; + // Populate user data layouts for each descriptor set that is active pUserDataLayout->setBindingRegBase = pInfo->userDataRegCount; @@ -331,7 +365,8 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( { uint32_t regCountSpillLimit = (setLayoutInfo.activeStageMask == VK_SHADER_STAGE_COMPUTE_BIT) ? settings.csInlinePushDescriptorUserDataLimit : - settings.gfxInlinePushDescriptorUserDataLimit; + gfxInlinePushDescriptorUserDataLimit; + uint32_t inlineRegCount = pInfo->userDataRegCount + setLayoutInfo.sta.dwSize + pushConstRegCount; diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index 5fafa58b..4f29ea61 100644 --- a/icd/api/vk_query.cpp +++ b/icd/api/vk_query.cpp @@ -280,29 +280,38 @@ VkResult PalQueryPool::GetResults( const uint32_t numXfbQueryDataElems = availability ? 3 : 2; // Vulkan supports 32-bit unsigned integer values data of transform feedback query, but Pal supports 64-bit only. - // So the query data is stored into xfbQueryData first. - Util::AutoBuffer xfbQueryData(queryCount * numXfbQueryDataElems, - pDevice->VkInstance()->Allocator()); + // So the query data is stored into pXfbQueryData first. + uint64_t* pXfbQueryData = nullptr; if (m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) { - pQueryData = &xfbQueryData[0]; - queryDataStride = sizeof(uint64_t) * numXfbQueryDataElems; - queryDataSize = sizeof(uint64_t) * numXfbQueryDataElems * queryCount; + queryDataStride = sizeof(uint64_t) * numXfbQueryDataElems; + queryDataSize = queryDataStride * queryCount; queryFlags |= VK_QUERY_RESULT_64_BIT; - } - Pal::Result palResult = m_pPalQueryPool[DefaultDeviceIndex]->GetResults( - VkToPalQueryResultFlags(queryFlags), - m_palQueryType, - startQuery, - queryCount, - m_internalMem.CpuAddr(DefaultDeviceIndex), - &queryDataSize, - pQueryData, - static_cast<size_t>(queryDataStride)); + pXfbQueryData = static_cast<uint64_t*>(pDevice->VkInstance()->AllocMem( + queryDataSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)); + if (pXfbQueryData == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + pQueryData = pXfbQueryData; + } - result = PalToVkResult(palResult); + if (result == VK_SUCCESS) + { + Pal::Result palResult = m_pPalQueryPool[DefaultDeviceIndex]->GetResults( + VkToPalQueryResultFlags(queryFlags), + m_palQueryType, + startQuery, + queryCount, + m_internalMem.CpuAddr(DefaultDeviceIndex), + &queryDataSize, + pQueryData, + static_cast<size_t>(queryDataStride)); + + result = PalToVkResult(palResult); + } if ((m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) && ((result == VK_SUCCESS) || (result == VK_NOT_READY))) @@ -311,7 +320,7 @@ VkResult PalQueryPool::GetResults( for (size_t i = 0; i < queryCount; i++) { - uint64_t* pXfbQueryData = static_cast<uint64_t*>(&xfbQueryData[i * numXfbQueryDataElems]); + uint64_t* pXfbQueryElem = &pXfbQueryData[i * numXfbQueryDataElems]; // The number of written primitives and the number of needed primitives are in reverse order in Pal. if ((flags & VK_QUERY_RESULT_64_BIT) == 0) @@ -320,14 +329,14 @@ if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { - pPrimitivesCount[0] = static_cast<uint32_t>(pXfbQueryData[1]); - pPrimitivesCount[1] = static_cast<uint32_t>(pXfbQueryData[0]); + pPrimitivesCount[0] = static_cast<uint32_t>(pXfbQueryElem[1]); + pPrimitivesCount[1] = static_cast<uint32_t>(pXfbQueryElem[0]); } if (availability) { // Set the availability state to the last slot.
- pPrimitivesCount[2] = static_cast<uint32_t>(pXfbQueryData[2]); + pPrimitivesCount[2] = static_cast<uint32_t>(pXfbQueryElem[2]); } } else @@ -336,19 +345,21 @@ if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { - pPrimitivesCount[0] = pXfbQueryData[1]; - pPrimitivesCount[1] = pXfbQueryData[0]; + pPrimitivesCount[0] = pXfbQueryElem[1]; + pPrimitivesCount[1] = pXfbQueryElem[0]; } if (availability) { // Set the availability state to the last slot. - pPrimitivesCount[2] = pXfbQueryData[2]; + pPrimitivesCount[2] = pXfbQueryElem[2]; } } pData = Util::VoidPtrInc(pData, static_cast<size_t>(stride)); } + + pDevice->VkInstance()->FreeMem(pXfbQueryData); } } diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index fa1f4542..c07b09b1 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -1596,8 +1596,6 @@ VkResult Queue::Submit( return result; } -// ===================================================================================================================== - // ===================================================================================================================== // Wait for a queue to go idle VkResult Queue::WaitIdle(void) diff --git a/icd/api/vk_shader.cpp b/icd/api/vk_shader.cpp index c108d16e..db859aca 100644 --- a/icd/api/vk_shader.cpp +++ b/icd/api/vk_shader.cpp @@ -200,7 +200,7 @@ VkResult ShaderModule::Init(const Device* pDevice, VkShaderModuleCreateFlags fla PipelineCompiler* pCompiler = pDevice->GetCompiler(DefaultDeviceIndex); VkResult result = pCompiler->BuildShaderModule( - pDevice, flags, m_codeSize, m_pCode, false, false, nullptr, nullptr, &m_handle); + pDevice, flags, 0, m_codeSize, m_pCode, false, false, nullptr, nullptr, &m_handle); if (result == VK_SUCCESS) { diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp index 7cea34ff..32bbe13b 100644 --- a/icd/api/vk_utils.cpp +++ b/icd/api/vk_utils.cpp @@ -45,6 +45,42 @@ uint32_t GetBuildTimeHash() } #if DEBUG +// ===================================================================================================================== +// If turned on and exe name is a match, this function spins idle until we have a debugger hooked.
+void WaitIdleForDebugger( + bool waitIdleToggled, + const char* pWaitIdleExeName, + uint32_t debugTimeout) +{ + if (waitIdleToggled) + { + bool waitForDebugger = false; + + if (strlen(pWaitIdleExeName) == 0) + { + // No executable name specified, apply on all Vulkan applications + waitForDebugger = true; + } + else + { + // Apply if executable name is a match + char appName[PATH_MAX]; + char appPath[PATH_MAX]; + utils::GetExecutableNameAndPath(appName, appPath); + + waitForDebugger = strcmp(pWaitIdleExeName, &appName[0]) == 0; + } + + if (waitForDebugger) + { + // Timeout the driver to give debuggers a chance to load all of the symbols + if (debugTimeout != 0) + { + Util::SleepMs(debugTimeout); + } + } + } +} #endif } // namespace utils diff --git a/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h b/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h index 6e827ce8..0bb32392 100644 --- a/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h +++ b/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h @@ -403,23 +403,6 @@ CMP_STATIC CGU_UINT32 cmp_clampui32(CMP_IN CGU_UINT32 v, CMP_IN CGU_UINT32 a, CM return v; } - -//# Half (in Hex) Float Comment -//# --------------------------------------------------------------------------- -//# 0001 (approx) = 0.000000059604645 smallest positive subnormal number -//# 03ff (approx) = 0.000060975552 largest subnormal number -//# 0400 (approx) = 0.00006103515625 smallest positive normal number -//# 7bff (approx) = 65504 largest normal number -//# 3bff (approx) = 0.99951172 largest number less than one -//# 3c00 (approx) = 1.00097656 smallest number larger than one -//# 3555 = 0.33325195 the rounding of 1/3 to nearest -//# c000 = -2 -//# 8000 = -0 -//# 0000 = 0 -//# 7c00 = infinity -//# fc00 = -infinity -//# Half Float Math - CMP_STATIC CGU_FLOAT HalfToFloat(CGU_UINT32 h) { #if defined(ASPM_GPU) diff --git a/icd/layers/include/vk_layer_switchable_graphics.h b/icd/layers/include/vk_layer_switchable_graphics.h index 0d122453..fcd0393e 100644 --- a/icd/layers/include/vk_layer_switchable_graphics.h +++ b/icd/layers/include/vk_layer_switchable_graphics.h @@ -52,7 +52,8 @@ struct NextLinkFuncPointers PFN_vkEnumeratePhysicalDeviceGroupsKHR pfnEnumeratePhysicalDeviceGroupsKHR; }; -typedef Util::HashMap DispatchTableHashMap; +typedef Util::HashMap, 256> DispatchTableHashMap; typedef VkResult (VKAPI_PTR *PFN_vkCreateInstance_SG)( const VkInstanceCreateInfo* pCreateInfo, diff --git a/icd/res/ver.h b/icd/res/ver.h index ef839867..0b234f03 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 262 +#define VULKAN_ICD_BUILD_VERSION 267 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION @@ -45,11 +45,11 @@ // These values specify the driver ID and driver info string #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR // "AMDOPEN" #define VULKAN_DRIVER_NAME_STR "AMD open-source driver" -#define VULKAN_DRIVER_INFO_STR "2023.Q2.1" +#define VULKAN_DRIVER_INFO_STR "2023.Q2.2" #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)" // These values tell which version of the conformance test the driver is compliant against #define CTS_VERSION_MAJOR 1 #define CTS_VERSION_MINOR 3 -#define CTS_VERSION_SUBMINOR 0 -#define CTS_VERSION_PATCH 0 +#define CTS_VERSION_SUBMINOR 3 +#define
CTS_VERSION_PATCH 1 diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 53eb6e2f..db12d65d 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -264,6 +264,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } + { + m_settings.disableImplicitInvariantExports = false; + } + #if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { @@ -1062,14 +1066,17 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( #if VKI_BUILD_GFX11 else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { + // Navi31 Mall and Tiling Settings + if (pInfo->revision == Pal::AsicRevision::Navi31) + { + // Mall no alloc settings give a ~1% gain + m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; + m_settings.mallNoAllocCtSsrPolicy = MallNoAllocCtSsrAsSnsr; + m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; - // Mall no alloc settings give a ~1% gain - m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; - m_settings.mallNoAllocCtSsrPolicy = MallNoAllocCtSsrAsSnsr; - m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; - - // This provides ~6% gain at 4k - m_settings.imageTilingPreference3dGpuWritable = Pal::ImageTilingPattern::YMajor; + // This provides ~6% gain at 4k + m_settings.imageTilingPreference3dGpuWritable = Pal::ImageTilingPattern::YMajor; + } } #endif } @@ -1254,6 +1261,21 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( #endif } + if (appProfile == AppProfile::RomeRemastered) + { +#if VKI_BUILD_GFX11 + if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) + { + pPalSettings->pwsMode = Pal::PwsMode::NoLateAcquirePoint; + } +#endif + } + + if (appProfile == AppProfile::Zink) + { + m_settings.padVertexBuffers = true; + } + pAllocCb->pfnFree(pAllocCb->pUserData, pInfo); } diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index 9ad3d91e..35d442fd 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -2305,6 +2305,18 @@ "Scope": "Driver", "Type": "uint64" }, + { + "Name": "DumpPipelineWithApiHash", + "Description": "Use PSO api hash as pipeline dump file name", + "Tags": [ + "SPIRV Options" + ], + "Defaults": { + "Default": false + }, + "Scope": "Driver", + "Type": "bool" + }, { "Description": "If true, duplicate pipelines will be dumped to a file with a numeric suffix attached to the filename to distinguish each copy of the pipeline.", "Tags": [ @@ -3608,7 +3620,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": false + "Default": true }, "Type": "bool", "VariableName": "rtEnableBuildParallel", @@ -3646,6 +3658,21 @@ "Scope": "Driver", "VariableName": "rtEnableAcquireReleaseInterface" }, + { + "Name": "EnableFusedInstanceNode", + "Description": "Enable fused instance node for BVH builder", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "DispatchRaysThreadGroupSize", "Type": "uint32", @@ -6617,6 +6644,18 @@ "Type": "uint32", "Name": "CpDmaCmdCopyMemoryMaxBytes" }, + { + "Description": "Disables all implicit invariant marking of exports, which in turn disables MUL/ADD -> FMA. 
This option is legal but may cause issues if applications are sensitive to FMA influencing some export results.", + "Tags": [ + "Optimization" + ], + "Defaults": { + "Default": false + }, + "Scope": "Driver", + "Type": "bool", + "Name": "DisableImplicitInvariantExports" + }, { "Description": "This value denotes whether using CmdClearBoundAttachments/CmdClearBoundDepthStencilTargets for subpass load op clears or not.", "Tags": [ @@ -6786,7 +6825,7 @@ "Optimization" ], "Defaults": { - "Default": 29 + "Default": 30 }, "Scope": "Driver", "Type": "uint32", @@ -8106,6 +8145,52 @@ "VariableName": "rtMaxRayLength", "Scope": "Driver" }, + { + "Description": "Used to wait idle on vkCreateInstance() call until a debugger is attached to the running application. Effective only on Windows debug builds.", + "Tags": [ + "Debugging" + ], + "BuildTypes": [ + "DEBUG" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Name": "WaitForDebugger", + "Scope": "Driver" + }, + { + "Description": "Executable name of a Vulkan application (e.g. vkcube.exe) upon which to wait idle until a debugger is attached. If empty, it applies to all Vulkan applications", + "Tags": [ + "Debugging" + ], + "BuildTypes": [ + "DEBUG" + ], + "Defaults": { + "Default": "" + }, + "Scope": "Driver", + "Name": "WaitForDebuggerExecutableName", + "Type": "string", + "Size": 256 + }, + { + "Description": "Timeout the driver in milliseconds to give debuggers a chance to load all of the symbols", + "Tags": [ + "Debugging" + ], + "BuildTypes": [ + "DEBUG" + ], + "Defaults": { + "Default": 0 + }, + "Scope": "Driver", + "Name": "DebugTimeout", + "Type": "uint32" + }, { "Description": "Enable printf debug functionality", "Tags": [