From 02e867e81626b9566c5cc24f134ec0760e93a94a Mon Sep 17 00:00:00 2001
From: wenqinli
Date: Thu, 18 May 2023 23:46:17 +0800
Subject: [PATCH] Update xgl from commit 2aeb0b25

* Hook up resources for ray history traces via PAL trace
* Initial changes for VK_EXT_device_address_binding_report
* Update PAL version in XGL to 796
* Update Khronos Vulkan headers to 1.3.250
* Refine pipeline cache in GPL
* Fix push descriptors spilled to the spill table
* The api hash dump name should be set to false by default
* Support new dynamic state vertexBufferCount
* Refine pipeline dump
* Check whether the image view of the color/depth/stencil attachment is a null handle
* Fix GPL fast-link failure on nv2x
* Link enableImplicitInvariantExports to driver-level disableImplicitInvariantExports
* [DebugPrintf] Fixed the case that reads entryHeader
* [DebugPrintf] Fixed the pPtr header
* Support BinaryInfo in ShaderEarlyCompileInfo for GPL caching
* Enable PalDeveloperCb immediately after the Pal Platform is created
* Fix ColorspaceHelper's lookup table so it only reports formats that are legal
* Bump GpuRT version to 33
* Add ASTC HDR support
* [CTS_Next] dEQP-VK.pipeline.*.extended_dynamic_state.*.large_static_rasterization_samples_* - tests crash
* [CTS_Next] dEQP-VK.pipeline.*.extended_dynamic_state.*color_blend_dual* - tests fail
* Update supported CTS to 1.3.3.1
* Expose VK_EXT_mutable_descriptor_type
* Add EnableFusedInstanceNode
* Navi3x tuning for Rage2
* Add appProfile for SOTTR
* Fix noisy asserts & alerts
* Fix correlation information not being provided for some binds
* Mutable descriptor type: fix array element pointer calculation bug
* Remove descriptor buffer memory type for images
* Fix potential crash when calculating pipeline cache id
* Force initialization of disableImplicitInvariantExports in LLPC builds
* Expose disableImplicitInvariantExports to LLPC
* Switch to parallel build path
* Add sleep to help wait for debugger to attach
* Fix corruption observed in Unigine Heaven/Unigine Valley using Zink
* Fix SE 5 Radeon ReLive TDR
* Fix HDR not being supported on Doom Eternal
* Configure emulatedRtIp in raytracing device init
* Fix issues in DebugPrintf
* [Navi31][Total War: Rome Remastered] Fix random corruption in gameplay
* Yuzu - Metroid Prime Remastered: fix corruption & crash on game load
---
 cmake/XglVersions.cmake | 4 +-
 icd/Loader/LunarG/Lnx/amd-icd.json | 4 +-
 icd/api/app_profile.cpp | 31 +
 icd/api/app_shader_optimizer.cpp | 3 +-
 .../{Navi31 => generic}/Rage2/profile.json | 0
 icd/api/color_space_helper.cpp | 6 +-
 icd/api/compiler_solution.cpp | 6 +-
 icd/api/compiler_solution_llpc.cpp | 110 +--
 icd/api/debug_printf.cpp | 156 ++--
 icd/api/entry.cpp | 1 +
 icd/api/gpumemory_event_handler.cpp | 736 ++++++++++++++----
 icd/api/graphics_pipeline_common.cpp | 15 +-
 icd/api/include/app_profile.h | 2 +
 icd/api/include/compiler_solution.h | 21 +-
 icd/api/include/compiler_solution_llpc.h | 16 +-
 icd/api/include/gpumemory_event_handler.h | 176 ++++-
 .../khronos/sdk-1.3/vulkan/vulkan_core.h | 51 +-
 icd/api/include/log.h | 7 +-
 icd/api/include/pipeline_compiler.h | 7 +-
 icd/api/include/vk_cmdbuffer.h | 11 +-
 icd/api/include/vk_conv.h | 83 +-
 icd/api/include/vk_deferred_operation.h | 2 +-
 icd/api/include/vk_defines.h | 2 +
 icd/api/include/vk_device.h | 9 +-
 icd/api/include/vk_extensions.h | 2 +
 .../include/vk_graphics_pipeline_library.h | 1 +
 icd/api/include/vk_physical_device.h | 17 -
 icd/api/include/vk_pipeline.h | 3 +-
 icd/api/include/vk_pipeline_cache.h | 19 +-
 icd/api/include/vk_shader.h | 2 +-
 icd/api/include/vk_utils.h | 3 +
icd/api/internal_mem_mgr.cpp | 45 +- icd/api/pipeline_compiler.cpp | 301 +++---- icd/api/raytrace/ray_tracing_device.cpp | 148 +++- icd/api/raytrace/ray_tracing_device.h | 39 + icd/api/raytrace/vk_ray_tracing_pipeline.cpp | 15 +- icd/api/raytrace/vk_ray_tracing_pipeline.h | 3 +- icd/api/strings/entry_points.txt | 1 + icd/api/strings/extensions.txt | 2 + icd/api/vk_cmdbuffer.cpp | 514 ++++++------ icd/api/vk_compute_pipeline.cpp | 8 +- icd/api/vk_conv.cpp | 18 + icd/api/vk_descriptor_pool.cpp | 5 +- icd/api/vk_descriptor_set.cpp | 27 +- icd/api/vk_device.cpp | 105 ++- icd/api/vk_dispatch.cpp | 1 + icd/api/vk_graphics_pipeline.cpp | 13 +- icd/api/vk_graphics_pipeline_library.cpp | 22 +- icd/api/vk_image.cpp | 3 + icd/api/vk_instance.cpp | 15 +- icd/api/vk_memory.cpp | 13 +- icd/api/vk_physical_device.cpp | 106 +-- icd/api/vk_pipeline.cpp | 9 +- icd/api/vk_pipeline_cache.cpp | 203 +---- icd/api/vk_pipeline_layout.cpp | 37 +- icd/api/vk_query.cpp | 59 +- icd/api/vk_queue.cpp | 2 - icd/api/vk_shader.cpp | 2 +- icd/api/vk_utils.cpp | 36 + .../shaders/bc3-encode-hlsl/bcn_common_api.h | 17 - .../include/vk_layer_switchable_graphics.h | 3 +- icd/res/ver.h | 8 +- icd/settings/settings.cpp | 36 +- icd/settings/settings_xgl.json | 89 ++- 64 files changed, 2178 insertions(+), 1233 deletions(-) rename icd/api/appopt/shader_profiles/llpc/gfxIp11_0/{Navi31 => generic}/Rage2/profile.json (100%) diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index f1a7ae0b..3248547c 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "792") +set(ICD_PAL_CLIENT_MAJOR_VERSION "796") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. @@ -37,7 +37,7 @@ set(ICD_GPUOPEN_CLIENT_MAJOR_VERSION "42") #if VKI_RAY_TRACING # This will become the value of GPURT_CLIENT_INTERFACE_MAJOR_VERSION if VKI_RAY_TRACING=1. # It describes the interface version of the GpuRT shared module that the ICD supports. -set(ICD_GPURT_CLIENT_MAJOR_VERSION "32") +set(ICD_GPURT_CLIENT_MAJOR_VERSION "33") #endif # This will become the value of LLPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_LLPC=1. 
diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json
index 56a0a745..b7b08e15 100644
--- a/icd/Loader/LunarG/Lnx/amd-icd.json
+++ b/icd/Loader/LunarG/Lnx/amd-icd.json
@@ -2,13 +2,13 @@
     "file_format_version": "1.0.0",
     "ICD": {
         "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so",
-        "api_version": "1.3.246"
+        "api_version": "1.3.250"
     },
     "layer": {
         "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@",
         "type": "GLOBAL",
         "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so",
-        "api_version": "1.3.246",
+        "api_version": "1.3.250",
         "implementation_version": "1",
         "description": "AMD switchable graphics layer",
         "functions": {
diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp
index 783b914d..25b90abf 100644
--- a/icd/api/app_profile.cpp
+++ b/icd/api/app_profile.cpp
@@ -207,6 +207,12 @@ constexpr AppProfilePatternEntry AppNameSeriousSam4Win =
     "serious sam 4 - 64bit"
 };
 
+constexpr AppProfilePatternEntry AppNameRomeRemasteredLinux =
+{
+    PatternAppNameLower,
+    "rome"
+};
+
 constexpr AppProfilePatternEntry AppEngineSedp =
 {
     PatternEngineNameLower,
@@ -375,6 +381,13 @@ constexpr AppProfilePatternEntry AppEngineSaschaWillemsExamples =
     "vulkanexample"
 };
 
+// Steam version of Shadow of the Tomb Raider
+constexpr AppProfilePatternEntry AppNameSOTTR =
+{
+    PatternAppNameLower,
+    "sottr.exe"
+};
+
 #if VKI_RAY_TRACING
 constexpr AppProfilePatternEntry AppEngineVKD3D =
 {
@@ -902,6 +915,15 @@ AppProfilePattern AppPatternTable[] =
         }
     },
 
+    {
+        AppProfile::RomeRemastered,
+        {
+            AppNameRomeRemasteredLinux,
+            AppEngineFeral3D,
+            PatternEnd
+        }
+    },
+
     {
         AppProfile::ThreeKingdoms,
         {
@@ -1161,6 +1183,15 @@ AppProfilePattern AppPatternTable[] =
         }
     },
 
+    {
+        AppProfile::SOTTR,
+        {
+            AppNameSOTTR,
+            AppEngineDXVK,
+            PatternEnd
+        }
+    },
+
 #if VKI_RAY_TRACING
     {
         AppProfile::ControlDX12,
diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp
index b989ed56..9ed4744c 100644
--- a/icd/api/app_shader_optimizer.cpp
+++ b/icd/api/app_shader_optimizer.cpp
@@ -167,7 +167,8 @@ void ShaderOptimizer::CalculateMatchingProfileEntriesHash(
             pHasher->Update(shaderAction.pipelineShader);
             pHasher->Update(shaderAction.shaderCreate);
 
-            if (shaderAction.shaderReplace.pCode != nullptr)
+            if (shaderAction.shaderCreate.apply.shaderReplaceEnabled &&
+                (shaderAction.shaderReplace.pCode != nullptr))
             {
                 pHasher->Update(
                     static_cast<const uint8_t*>(shaderAction.shaderReplace.pCode),
diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Rage2/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/Rage2/profile.json
similarity index 100%
rename from icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Rage2/profile.json
rename to icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/Rage2/profile.json
diff --git a/icd/api/color_space_helper.cpp b/icd/api/color_space_helper.cpp
index 9214cd76..93a36a44 100644
--- a/icd/api/color_space_helper.cpp
+++ b/icd/api/color_space_helper.cpp
@@ -51,12 +51,12 @@ struct LookupDefines
 
 const LookupDefines colorspaceLookup[] =
 {
-    { Pal::ScreenColorSpace::CsSrgb, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, FmtSupport::Fmt_All },
+    { Pal::ScreenColorSpace::CsSrgb, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, FmtSupport::Fmt_8bpc },
     { Pal::ScreenColorSpace::CsBt709, VK_COLOR_SPACE_BT709_NONLINEAR_EXT, FmtSupport::Fmt_All },
     { Pal::ScreenColorSpace::TfHlg, VK_COLOR_SPACE_HDR10_HLG_EXT, FmtSupport::Fmt_KnownHDR },
-    { Pal::ScreenColorSpace::TfPq2084, VK_COLOR_SPACE_HDR10_ST2084_EXT, FmtSupport::Fmt_KnownHDR },
+    { Pal::ScreenColorSpace::TfPq2084,
VK_COLOR_SPACE_HDR10_ST2084_EXT, FmtSupport::Fmt_10bpc }, { Pal::ScreenColorSpace::TfDolbyVision, VK_COLOR_SPACE_DOLBYVISION_EXT, FmtSupport::Fmt_8bpc_unorm }, - { Pal::ScreenColorSpace::CsBt2020, VK_COLOR_SPACE_BT2020_LINEAR_EXT, FmtSupport::Fmt_KnownHDR }, + { Pal::ScreenColorSpace::CsBt2020, VK_COLOR_SPACE_BT2020_LINEAR_EXT, FmtSupport::Fmt_10bpc }, { Pal::ScreenColorSpace::CsAdobe, VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT, FmtSupport::Fmt_All }, { Pal::ScreenColorSpace::CsDciP3, VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT, FmtSupport::Fmt_All }, { Pal::ScreenColorSpace::CsScrgb, VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT, FmtSupport::Fmt_16bpc_sfloat }, diff --git a/icd/api/compiler_solution.cpp b/icd/api/compiler_solution.cpp index d9e8414e..d41f9c29 100644 --- a/icd/api/compiler_solution.cpp +++ b/icd/api/compiler_solution.cpp @@ -58,9 +58,9 @@ CompilerSolution::~CompilerSolution() // ===================================================================================================================== // Initialize CompilerSolution class VkResult CompilerSolution::Initialize( - Vkgc::GfxIpVersion gfxIp, - Pal::GfxIpLevel gfxIpLevel, - Vkgc::ICache* pCache) + Vkgc::GfxIpVersion gfxIp, + Pal::GfxIpLevel gfxIpLevel, + PipelineBinaryCache* pCache) { m_gfxIp = gfxIp; m_gfxIpLevel = gfxIpLevel; diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 72d660a4..9076938b 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -59,18 +59,21 @@ CompilerSolutionLlpc::~CompilerSolutionLlpc() // ===================================================================================================================== // Initialize CompilerSolutionLlpc class VkResult CompilerSolutionLlpc::Initialize( - Vkgc::GfxIpVersion gfxIp, - Pal::GfxIpLevel gfxIpLevel, - Vkgc::ICache* pCache) + Vkgc::GfxIpVersion gfxIp, + Pal::GfxIpLevel gfxIpLevel, + PipelineBinaryCache* pCache) { const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); - Vkgc::ICache* pInternalCache = pCache; - if (settings.shaderCacheMode == ShaderCacheDisable) + Vkgc::ICache* pInternalCache = nullptr; + if (pCache != nullptr) { - pInternalCache = nullptr; + if (settings.shaderCacheMode != ShaderCacheDisable) + { + pInternalCache = pCache->GetCacheAdapter(); + } } - VkResult result = CompilerSolution::Initialize(gfxIp, gfxIpLevel, pInternalCache); + VkResult result = CompilerSolution::Initialize(gfxIp, gfxIpLevel, pCache); if (result == VK_SUCCESS) { @@ -91,32 +94,12 @@ void CompilerSolutionLlpc::Destroy() } } -// ===================================================================================================================== -// Get size of shader cache object -size_t CompilerSolutionLlpc::GetShaderCacheSize( - PipelineCompilerType cacheType) -{ - VK_NEVER_CALLED(); - return 0; -} - -// ===================================================================================================================== -// Creates shader cache object. -VkResult CompilerSolutionLlpc::CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - void* pShaderCacheMem, - uint32_t expectedEntries, - ShaderCache* pShaderCache) -{ - return VK_ERROR_INITIALIZATION_FAILED; -} - // ===================================================================================================================== // Builds shader module from SPIR-V binary code. 
VkResult CompilerSolutionLlpc::BuildShaderModule(
    const Device* pDevice,
    VkShaderModuleCreateFlags flags,
+   VkShaderModuleCreateFlags internalShaderFlags,
    size_t codeSize,
    const void* pCode,
    const bool adaptForFastLink,
@@ -142,7 +125,7 @@ VkResult CompilerSolutionLlpc::BuildShaderModule(
     pPipelineCompiler->ApplyPipelineOptions(pDevice, 0, &moduleInfo.options.pipelineOptions);
 
 #if VKI_RAY_TRACING
-    if ((flags & VK_SHADER_MODULE_RAY_TRACING_INTERNAL_SHADER_BIT) != 0)
+    if ((internalShaderFlags & VK_INTERNAL_SHADER_FLAGS_RAY_TRACING_INTERNAL_SHADER_BIT) != 0)
     {
 #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 55
         moduleInfo.options.pipelineOptions.internalRtShaders = true;
@@ -183,11 +166,6 @@ void CompilerSolutionLlpc::FreeShaderModule(ShaderModuleHandle* pShaderModule)
     auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
 
     pInstance->FreeMem(pShaderModule->pLlpcShaderModule);
-
-    if (pShaderModule->elfPackage.codeSize > 0)
-    {
-        pInstance->FreeMem(const_cast<void*>(pShaderModule->elfPackage.pCode));
-    }
 }
 
 // =====================================================================================================================
@@ -377,36 +355,72 @@ VkResult CompilerSolutionLlpc::CreateGraphicsPipelineBinary(
 // Build ElfPackage for a specific shader module based on pipeline information
 VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary(
     const Device* pDevice,
+    PipelineCache* pPipelineCache,
     const ShaderStage stage,
     GraphicsPipelineBinaryCreateInfo* pCreateInfo,
     void* pPipelineDumpHandle,
     ShaderModuleHandle* pShaderModule)
 {
     VkResult result = VK_SUCCESS;
+    Util::MetroHash::Hash cacheId = {};
 
-    // Build the LLPC pipeline
-    Llpc::GraphicsPipelineBuildOut pipelineOut = {};
-
-    Vkgc::UnlinkedShaderStage unlinkedStage = UnlinkedStageCount;
-
-    // Belong to vertexProcess stage before fragment
-    if (stage < ShaderStage::ShaderStageFragment)
+    bool hitCache = false;
+    if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCache() != nullptr))
     {
-        unlinkedStage = UnlinkedShaderStage::UnlinkedStageVertexProcess;
+        Vkgc::BinaryData elfPackage = {};
+        Util::MetroHash128 hasher;
+        hasher.Update(pCreateInfo->libraryHash[stage]);
+        hasher.Update(m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash());
+        hasher.Finalize(cacheId.bytes);
+        auto pAppCache = pPipelineCache->GetPipelineCache();
+        hitCache = (pAppCache->LoadPipelineBinary(&cacheId, &elfPackage.codeSize, &elfPackage.pCode)
+            == Util::Result::Success);
+        pShaderModule->elfPackage = elfPackage;
     }
-    else if (stage == ShaderStage::ShaderStageFragment)
+
+    if (hitCache == false)
     {
-        unlinkedStage = UnlinkedShaderStage::UnlinkedStageFragment;
-    }
+        // Build the LLPC pipeline
+        Llpc::GraphicsPipelineBuildOut pipelineOut = {};
+        Vkgc::UnlinkedShaderStage unlinkedStage = UnlinkedStageCount;
 
-    auto llpcResult = m_pLlpc->buildGraphicsShaderStage(
+        // Stages before the fragment stage belong to the vertex-processing part of the pipeline
+        if (stage < ShaderStage::ShaderStageFragment)
+        {
+            unlinkedStage = UnlinkedShaderStage::UnlinkedStageVertexProcess;
+        }
+        else if (stage == ShaderStage::ShaderStageFragment)
+        {
+            unlinkedStage = UnlinkedShaderStage::UnlinkedStageFragment;
+        }
+
+        auto llpcResult = m_pLlpc->buildGraphicsShaderStage(
             &pCreateInfo->pipelineInfo,
             &pipelineOut,
             unlinkedStage,
             pPipelineDumpHandle);
 
-    if (llpcResult == Vkgc::Result::Success)
+        if (llpcResult == Vkgc::Result::Success)
+        {
+            pShaderModule->elfPackage = pipelineOut.pipelineBin;
+            if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCache() != nullptr))
+            {
+                pPipelineCache->GetPipelineCache()->StorePipelineBinary(
+                    &cacheId, pipelineOut.pipelineBin.codeSize, pipelineOut.pipelineBin.pCode);
+            }
+        }
+        else
+        {
+
+            result = (llpcResult == Vkgc::Result::ErrorOutOfMemory) ?
+                VK_ERROR_OUT_OF_HOST_MEMORY : VK_ERROR_INITIALIZATION_FAILED;
+
+        }
+    }
+
+    if (result == VK_SUCCESS)
     {
-        pShaderModule->elfPackage = pipelineOut.pipelineBin;
+        pCreateInfo->earlyElfPackage[stage] = pShaderModule->elfPackage;
+        pCreateInfo->earlyElfPackageHash[stage] = cacheId;
     }
 
     return result;
diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp
index ef5f432c..13ce0b3e 100644
--- a/icd/api/debug_printf.cpp
+++ b/icd/api/debug_printf.cpp
@@ -114,23 +114,19 @@ void DebugPrintf::BindPipeline(
             srdInfo.range = m_printfMemory.Size();
             pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &srdInfo, pTable);
             m_frame = 0;
-        }
-    }
-
-    if (m_state == MemoryAllocated)
-    {
-        const Pal::uint32* pEntry = reinterpret_cast<const Pal::uint32*>(&tableVa);
-        pCmdBuffer->CmdSetUserData(static_cast<Pal::PipelineBindPoint>(bindPoint), userDataOffset, 1, pEntry);
+            const Pal::uint32* pEntry = reinterpret_cast<const Pal::uint32*>(&tableVa);
+            pCmdBuffer->CmdSetUserData(static_cast<Pal::PipelineBindPoint>(bindPoint), userDataOffset, 1, pEntry);
 
-        m_parsedFormatStrings.Reset();
-        for (auto it = pPipeline->GetFormatStrings().Begin(); it.Get() != nullptr; it.Next())
-        {
-            bool found = true;
-            PrintfSubSection* pSubSections = nullptr;
-            m_parsedFormatStrings.FindAllocate(it.Get()->key, &found, &pSubSections);
-            VK_ASSERT(found == false);
-            pSubSections->Reserve(1);
-            ParseFormatStringsToSubSection(it.Get()->value.printStr, pSubSections);
+            m_parsedFormatStrings.Reset();
+            for (auto it = pPipeline->GetFormatStrings().Begin(); it.Get() != nullptr; it.Next())
+            {
+                bool found = true;
+                PrintfSubSection* pSubSections = nullptr;
+                m_parsedFormatStrings.FindAllocate(it.Get()->key, &found, &pSubSections);
+                VK_ASSERT(found == false);
+                pSubSections->Reserve(1);
+                ParseFormatStringsToSubSection(it.Get()->value.printStr, pSubSections);
+            }
         }
     }
 }
@@ -169,7 +165,8 @@ Pal::Result DebugPrintf::PostQueueProcess(
     uint64_t bufferSize = 0;
     uint32_t* pPrintBuffer = nullptr;
     uint32_t* pPtr = nullptr;
-    uint64_t maxBufferDWSize = m_printfMemory.Size() >> 2;
+    constexpr uint32_t bufferHeaderSize = 4;
+    uint64_t maxBufferDWSize = (m_printfMemory.Size() >> 2) - bufferHeaderSize;
     if (palResult == Pal::Result::Success)
     {
         // Buffer Header is 4 dwords: {BufferOffset_Loword, BufferOffset_Hiword, reserved0, reserved1};
@@ -179,76 +176,83 @@ Pal::Result DebugPrintf::PostQueueProcess(
         pPtr += 2;
         bufferSize = (static_cast<uint64_t>(bufferSizeHigh) << 32) | static_cast<uint64_t>(bufferSizeLower);
         bufferSize = Util::Min(bufferSize, maxBufferDWSize);
+        if (bufferSize > 0)
+        {
+            pPrintBuffer = static_cast<uint32_t*>(pDevice->VkInstance()->AllocMem(
+                bufferSize * sizeof(uint32_t), 4, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND));
 
-        pPrintBuffer = static_cast<uint32_t*>(pDevice->VkInstance()->AllocMem(
-            bufferSize * sizeof(uint32_t), 4, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND));
-
-        memcpy(pPrintBuffer, pPtr, bufferSize * 4);
+            memcpy(pPrintBuffer, pPtr, bufferSize * 4);
+        }
 
         m_printfMemory.Unmap(deviceIdx);
 
-        const auto& formatStrings = m_pPipeline->GetFormatStrings();
-        const uint32_t entryHeaderSize = 2;
-        uint64_t decodeOffset = 0;
-        PrintfString outputBufferStr(nullptr);
-        outputBufferStr.Reserve(10);
-        Vector outputDecodedSpecifiers(nullptr);
-        outputDecodedSpecifiers.Reserve(5);
-        while (decodeOffset < bufferSize)
+        if (bufferSize > 0)
         {
-            // Decode entry
-            uint32_t entryHeaderLow = *pPtr++;
-            uint32_t entryHeaderHigh = *pPtr++;
-            uint64_t entryHeader =
((uint64_t)(entryHeaderHigh) << 32) | uint64_t(entryHeaderLow); - // 64 bit header {[0:15], [16:63]} entrySize,hash value for the string - uint64_t entrySize = (entryHeader & 65535); - uint64_t entryHashValue = entryHeader >> 16; - - decodeOffset += entryHeaderSize; - // Check hash value in the entry valid - auto pEntry = formatStrings.FindKey(entryHashValue); - if (pEntry == nullptr) + const auto& formatStrings = m_pPipeline->GetFormatStrings(); + const uint32_t entryHeaderSize = 2; + uint64_t decodeOffset = 0; + PrintfString outputBufferStr(nullptr); + outputBufferStr.Reserve(10); + Vector outputDecodedSpecifiers(nullptr); + outputDecodedSpecifiers.Reserve(5); + // Set pPtr point to the head of the system memory + pPtr = pPrintBuffer; + while ((bufferSize - decodeOffset) > 1) { - break; - } + // Decode entry + uint32_t entryHeaderLow = *pPtr++; + uint32_t entryHeaderHigh = *pPtr++; + uint64_t entryHeader = ((uint64_t)(entryHeaderHigh) << 32) | uint64_t(entryHeaderLow); + // 64 bit header {[0:15], [16:63]} entrySize,hash value for the string + uint64_t entryValuesSize = (entryHeader & 65535) - entryHeaderSize; + uint64_t entryHashValue = entryHeader >> 16; + + decodeOffset += entryHeaderSize; + // Check hash value in the entry valid and if there is space to decoded entry values + auto pEntry = formatStrings.FindKey(entryHashValue); + if ((pEntry == nullptr) || ((bufferSize - decodeOffset) < entryValuesSize)) + { + break; + } - const PrintfString& formatString = pEntry->printStr; - const PrintfBit& bitPos = pEntry->bit64s; - PrintfSubSection* pSubSections = m_parsedFormatStrings.FindKey(entryHashValue); - int initSize = bitPos.size() - outputDecodedSpecifiers.size(); - for (int i = 0; i < initSize; ++i) - { - outputDecodedSpecifiers.PushBack(nullptr); - } + const PrintfString& formatString = pEntry->printStr; + const PrintfBit& bitPos = pEntry->bit64s; + PrintfSubSection* pSubSections = m_parsedFormatStrings.FindKey(entryHashValue); + int initSize = bitPos.size() - outputDecodedSpecifiers.size(); + for (int i = 0; i < initSize; ++i) + { + outputDecodedSpecifiers.PushBack(nullptr); + } - // Get printf output variable in dword size - unsigned outputsInDwords = 0; - uint64_t outputVar; - for (uint32_t varIndex = 0; varIndex < bitPos.size(); varIndex++) - { - outputVar = *pPtr++; - outputsInDwords++; - bool is64bit = bitPos[varIndex]; - if (is64bit) + // Get printf output variable in dword size + unsigned outputsInDwords = 0; + uint64_t outputVar; + for (uint32_t varIndex = 0; varIndex < bitPos.size(); varIndex++) { - uint64_t hiDword = *pPtr++; - outputVar = (hiDword << 32) | outputVar; + outputVar = *pPtr++; outputsInDwords++; - } + bool is64bit = bitPos[varIndex]; + if (is64bit) + { + uint64_t hiDword = *pPtr++; + outputVar = (hiDword << 32) | outputVar; + outputsInDwords++; + } - DecodeSpecifier(formatString, - outputVar, - is64bit, - pSubSections, - varIndex, - &outputDecodedSpecifiers[varIndex]); + DecodeSpecifier(formatString, + outputVar, + is64bit, + pSubSections, + varIndex, + &outputDecodedSpecifiers[varIndex]); + } + OutputBufferString(formatString, *pSubSections, &outputBufferStr); + decodeOffset += outputsInDwords; } - OutputBufferString(formatString, *pSubSections, &outputBufferStr); - decodeOffset += outputsInDwords; + WriteToFile(outputBufferStr); + pDevice->VkInstance()->FreeMem(pPrintBuffer); + m_frame++; } - WriteToFile(outputBufferStr); - pDevice->VkInstance()->FreeMem(pPrintBuffer); - m_frame++; } return palResult; @@ -276,7 +280,7 @@ void DebugPrintf::WriteToFile( const 
char* fileBeginPostfix =" Begin ========================\n"; const char* fileEnd = "========================= Session End ========================\n"; file.Write(fileBeginPrefix, strlen(fileBeginPrefix)); - file.Write(fileName.Data(), fileName.NumElements()); + file.Write(fileName.Data(), strlen(fileName.Data())); file.Write(fileBeginPostfix, strlen(fileBeginPostfix)); result = file.Write(outputBuffer.Data(), outputBuffer.size()); if (result == Util::Result::Success) @@ -316,7 +320,7 @@ PrintfString DebugPrintf::GetFileName( AppendPrintfString(&fName, pDumpFolder, strlen(pDumpFolder)); AppendPrintfString(&fName, "/", 1); AppendPrintfString(&fName, fileName, strlen(fileName)); - AppendPrintfString(&fName, ".txt", 4); + AppendPrintfString(&fName, ".txt\0", 5); } return fName; } diff --git a/icd/api/entry.cpp b/icd/api/entry.cpp index 781cb33a..d59d93f7 100644 --- a/icd/api/entry.cpp +++ b/icd/api/entry.cpp @@ -102,6 +102,7 @@ VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer( ApiCmdBuffer::ObjectFromHandle(cmdBuffer)->BindIndexBuffer( buffer, offset, + VK_WHOLE_SIZE, indexType); } diff --git a/icd/api/gpumemory_event_handler.cpp b/icd/api/gpumemory_event_handler.cpp index 3a5c083b..d2d8a2c5 100644 --- a/icd/api/gpumemory_event_handler.cpp +++ b/icd/api/gpumemory_event_handler.cpp @@ -33,12 +33,18 @@ #include "include/vk_instance.h" #include "include/vk_device.h" +#include "palIntrusiveListImpl.h" #include "palHashMapImpl.h" +#include "palHashSetImpl.h" #include "palVectorImpl.h" namespace vk { +// Alloc, suballoc and bind entries have NullHandle assigned as object handle, +// if the correlation information was not provided yet to gpu memory event handler +constexpr uint64_t NullHandle = 0; + // ===================================================================================================================== GpuMemoryEventHandler::GpuMemoryEventHandler(Instance* pInstance) : @@ -47,12 +53,16 @@ GpuMemoryEventHandler::GpuMemoryEventHandler(Instance* pInstance) m_allocationHashMap(32, pInstance->Allocator()), m_vulkanSubAllocationHashMap(32, pInstance->Allocator()), m_palSubAllocationHashMap(32, pInstance->Allocator()), + m_bindHashMap(32, pInstance->Allocator()), + m_deviceHashSet(32, pInstance->Allocator()), m_memoryObjectId(0), - m_memoryEventEnables(0) + m_deviceCount(0) { m_allocationHashMap.Init(); m_vulkanSubAllocationHashMap.Init(); m_palSubAllocationHashMap.Init(); + m_deviceHashSet.Init(); + m_bindHashMap.Init(); } // ===================================================================================================================== @@ -86,6 +96,15 @@ void GpuMemoryEventHandler::Destroy() PAL_ALERT_MSG(m_vulkanSubAllocationHashMap.GetNumEntries() != 0, "Vulkan suballocations were not freed."); PAL_ALERT_MSG(m_palSubAllocationHashMap.GetNumEntries() != 0, "Pal suballocations were not freed."); + for (auto iter = m_bindHashMap.Begin(); iter.Get() != nullptr; iter.Next()) + { + const BindDataList& bindDataList = iter.Get()->value; + if (bindDataList.IsEmpty() == false) + { + PAL_ALERT_MSG(bindDataList.IsEmpty() == false, "Memory binds map is not empty."); + } + } + Util::Destructor(this); m_pInstance->FreeMem(this); @@ -113,7 +132,10 @@ void GpuMemoryEventHandler::PalDeveloperCallback( if (exists == false) { // Store the allocation information - pAllocationData->allocationData = *pGpuMemoryData; + pAllocationData->allocationData = *pGpuMemoryData; + pAllocationData->objectHandle = NullHandle; + pAllocationData->objectType = VK_OBJECT_TYPE_UNKNOWN; + 
pAllocationData->reportedToDeviceMemoryReport = false; // If this is a Pal internal allocation that is not suballocated report it to device_memory_report now if ((pAllocationData->allocationData.flags.isClient == 0) && // Pal internal, not Vulkan @@ -121,21 +143,18 @@ void GpuMemoryEventHandler::PalDeveloperCallback( (pAllocationData->allocationData.flags.buddyAllocated == 0) && // Buddy allocator is suballocated (pAllocationData->allocationData.flags.isExternal == 0)) // vkCreateMemory handles external { - m_callbacksLock.LockForRead(); - auto iter = m_callbacks.Begin(); - uint32_t heapIndex = 0; - - if (iter.IsValid()) - { - auto*const pPhysicalDevice = (iter.Get().pDevice)->VkPhysicalDevice(DefaultDeviceIndex); - bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, &heapIndex); - VK_ASSERT(validHeap); - } - m_callbacksLock.UnlockForRead(); - - // The instance is the default Vulkan object for allocations not specifically tracked otherwise. - pAllocationData->objectHandle = Instance::IntValueFromHandle(Instance::FromObject(m_pInstance)); - pAllocationData->objectType = VK_OBJECT_TYPE_INSTANCE; + // This is a Pal internal allocation that is not suballocated report it to device_memory_report now + Util::RWLockAuto deviceHashSetLock(&m_deviceHashSetLock); + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + uint32_t heapIndex = 0; + bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, + &heapIndex); + VK_ASSERT(validHeap); + // Physical device is the default Vulkan object for allocations not specifically tracked otherwise. + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + pAllocationData->objectHandle = ApiPhysicalDevice::IntValueFromHandle(pHandle); + pAllocationData->objectType = VK_OBJECT_TYPE_PHYSICAL_DEVICE; pAllocationData->reportedToDeviceMemoryReport = true; DeviceMemoryReportAllocateEvent( @@ -163,11 +182,10 @@ void GpuMemoryEventHandler::PalDeveloperCallback( if (pAllocationData != nullptr) { - // If this is a Pal internal free that is not suballocated report it to device_memory_report now - if ((pAllocationData->allocationData.flags.isClient == 0) && // Pal internal, not Vulkan - (pAllocationData->allocationData.flags.isCmdAllocator == 0) && // Command allocator is suballocated - (pAllocationData->allocationData.flags.buddyAllocated == 0) && // Buddy allocator is suballocated - (pAllocationData->allocationData.flags.isExternal == 0)) // vkCreateMemory handles external + // Report non-suballocated frees to device_memory_report and device_address_binding_report now, + // including Vulkan allocations + if ((pAllocationData->allocationData.flags.isCmdAllocator == 0) && // Command allocator is suballocated + (pAllocationData->allocationData.flags.buddyAllocated == 0)) // Buddy allocator is suballocated { if (pAllocationData->reportedToDeviceMemoryReport == true) { @@ -177,10 +195,13 @@ void GpuMemoryEventHandler::PalDeveloperCallback( pAllocationData->allocationData.pGpuMemory->Desc().uniqueId, pAllocationData->allocationData.flags.isExternal); } - else + else if ((pAllocationData->allocationData.flags.isClient != 1) && + (pAllocationData->allocationData.flags.isExternal != 1)) { PAL_ALERT_ALWAYS_MSG("Allocation freed that was never reported to device_memory_report"); } + + DeviceAddressBindingReportUnbindEvent(pAllocationData); } m_allocationHashMap.Erase(pGpuMemoryData->pGpuMemory); @@ -197,10 +218,10 @@ void 
GpuMemoryEventHandler::PalDeveloperCallback( Util::RWLockAuto lock(&m_palSubAllocationHashMapLock); auto*const pGpuMemoryData = reinterpret_cast(pCbData); - PAL_ASSERT_MSG((pGpuMemoryData->flags.isClient == 0) && // Pal internal allocation - (pGpuMemoryData->flags.isCmdAllocator == 0) && // Command allocator is suballocated Pal internal - (pGpuMemoryData->flags.buddyAllocated == 1) && // Buddy allocator is suballocated Pal internal - (pGpuMemoryData->flags.isExternal == 0) && // External memory is handled by vkCreateMemory + PAL_ASSERT_MSG((pGpuMemoryData->flags.isClient == 0) && // Pal internal allocation + ((pGpuMemoryData->flags.isCmdAllocator == 1) || // Command allocator, suballocated Pal internal + (pGpuMemoryData->flags.buddyAllocated == 1)) && // Buddy allocator is suballocated Pal internal + (pGpuMemoryData->flags.isExternal == 0) && // External memory is handled by vkCreateMemory (pGpuMemoryData->size < pGpuMemoryData->pGpuMemory->Desc().size), // Suballoc should be smaller "The base GPU allocation of this Pal internal suballocation is not as expected."); @@ -217,25 +238,41 @@ void GpuMemoryEventHandler::PalDeveloperCallback( // Add the new Pal suballocation if it did not exist already. if (exists == false) { - // Store the Pal suballocation information - pSubAllocData->allocationData = *pGpuMemoryData; - pSubAllocData->memoryObjectId = GenerateMemoryObjectId(); + Util::RWLockAuto deviceHashSetLock(&m_deviceHashSetLock); + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + uint32_t heapIndex = 0; + bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, + &heapIndex); + VK_ASSERT(validHeap); - m_callbacksLock.LockForRead(); - auto iter = m_callbacks.Begin(); - - if (iter.IsValid()) + // Store the Pal suballocation information + pSubAllocData->allocationData = *pGpuMemoryData; + pSubAllocData->memoryObjectId = GenerateMemoryObjectId(); + pSubAllocData->heapIndex = heapIndex; + pSubAllocData->offset = pGpuMemoryData->offset; + pSubAllocData->subAllocationSize = pGpuMemoryData->size; + pSubAllocData->objectHandle = NullHandle; + pSubAllocData->objectType = VK_OBJECT_TYPE_UNKNOWN; + pSubAllocData->reportedToDeviceMemoryReport = false; + + // Defer reporting of Pal buddy allocated suballocations to device_memory_report to + // ReportDeferredPalSubAlloc() but report CmdAllocator suballocations now + if (pGpuMemoryData->flags.isCmdAllocator) { - uint32_t heapIndex = 0; - auto*const pPhysicalDevice = (iter.Get().pDevice)->VkPhysicalDevice(DefaultDeviceIndex); - bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pGpuMemoryData->heap, &heapIndex); - VK_ASSERT(validHeap); - - pSubAllocData->heapIndex = heapIndex; + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + pSubAllocData->objectHandle = ApiPhysicalDevice::IntValueFromHandle(pHandle); + pSubAllocData->objectType = VK_OBJECT_TYPE_PHYSICAL_DEVICE; + pSubAllocData->reportedToDeviceMemoryReport = true; + + DeviceMemoryReportAllocateEvent( + pSubAllocData->objectHandle, + pSubAllocData->allocationData.size, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->heapIndex, + pSubAllocData->allocationData.flags.isExternal); } - m_callbacksLock.UnlockForRead(); - - // Defer reporting of Pal suballocations to device_memory_report to ReportDeferredPalSubAlloc() } else { @@ -265,11 +302,17 @@ void GpuMemoryEventHandler::PalDeveloperCallback( pSubAllocData->memoryObjectId, 
pSubAllocData->allocationData.flags.isExternal); } - else + else if (pSubAllocData->allocationData.flags.isCmdAllocator == 1) { - //PAL_ALERT_ALWAYS_MSG("SubFree of a Pal suballocation that was never reported to device_memory_report"); + PAL_ALERT_ALWAYS_MSG("SubFree: CmdAllocator suballoc was never reported to device_memory_report"); + } + else if (pSubAllocData->allocationData.flags.buddyAllocated == 1) + { + PAL_ALERT_ALWAYS_MSG("SubFree: Buddy Allocated suballoc was never reported to device_memory_report"); } + DeviceAddressBindingReportUnbindEvent(pSubAllocData); + m_palSubAllocationHashMap.Erase(key); } else @@ -281,6 +324,38 @@ void GpuMemoryEventHandler::PalDeveloperCallback( } case Pal::Developer::CallbackType::BindGpuMemory: { + auto* const pBindGpuMemoryData = reinterpret_cast(pCbData); + + if (pBindGpuMemoryData->isSystemMemory == false) + { + Util::MutexAuto lock(&m_bindHashMapMutex); + + bool exists = false; + BindDataList* pBindDataList = nullptr; + + Pal::Result palResult = m_bindHashMap.FindAllocate(pBindGpuMemoryData->pGpuMemory, &exists, &pBindDataList); + + if (palResult == Pal::Result::Success) + { + if (exists == false) + { + pBindDataList = VK_PLACEMENT_NEW(pBindDataList) BindDataList(); + } + + BindDataListNode* pBindDataListNode = nullptr; + BindDataListNode::Create(m_pInstance, pBindGpuMemoryData, &pBindDataListNode); + + if (pBindDataListNode != nullptr) + { + DeviceAddressBindingReportUnbindEvent(pBindDataListNode->GetData()); + + pBindDataList->PushFront(pBindDataListNode->GetNode()); + + DeviceAddressBindingReportBindEvent(pBindDataListNode->GetData()); + } + } + } + break; } default: @@ -289,18 +364,30 @@ void GpuMemoryEventHandler::PalDeveloperCallback( } // ===================================================================================================================== -// GpuMemoryEventHandler is requested by VK_EXT_device_memory_report and/or VK_EXT_device_address_binding_report. -// Increment the reference count of requests for GPU memory events. -void GpuMemoryEventHandler::EnableGpuMemoryEvents() +// GpuMemoryEventHandler events are required for VK_EXT_device_memory_report and VK_EXT_device_address_binding_report. +// Increment the count of devices when one or more of these extensions enabled. +void GpuMemoryEventHandler::EnableGpuMemoryEvents( + const Device* pDevice) { - Util::AtomicIncrement(&m_memoryEventEnables); + Util::RWLockAuto lock(&m_deviceHashSetLock); + VK_ASSERT(m_deviceHashSet.Contains(pDevice) == false); + + Util::AtomicIncrement(&m_deviceCount); + + m_deviceHashSet.Insert(pDevice); } // ===================================================================================================================== -// Decrement the reference count of requests for GPU memory events. -void GpuMemoryEventHandler::DisableGpuMemoryEvents() +// Decrement the count of devices to remove a device with one or more extensions enabled requiring GPU memory events. 
+void GpuMemoryEventHandler::DisableGpuMemoryEvents( + const Device* pDevice) { - Util::AtomicDecrement(&m_memoryEventEnables); + Util::RWLockAuto lock(&m_deviceHashSetLock); + VK_ASSERT(m_deviceHashSet.Contains(pDevice)); + + Util::AtomicDecrement(&m_deviceCount); + + m_deviceHashSet.Erase(pDevice); } // ===================================================================================================================== @@ -328,36 +415,46 @@ void GpuMemoryEventHandler::UnregisterDeviceMemoryReportCallbacks( // ===================================================================================================================== void GpuMemoryEventHandler::VulkanAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, uint64_t objectHandle, VkObjectType objectType, uint64_t heapIndex) { - Util::RWLockAuto lock(&m_allocationHashMapLock); + Util::RWLockAuto lock(&m_allocationHashMapLock); AllocationData* pAllocationData = m_allocationHashMap.FindKey(pGpuMemory); if (pAllocationData != nullptr) { - if (pAllocationData->reportedToDeviceMemoryReport == false) + pAllocationData->objectType = objectType; + pAllocationData->objectHandle = objectHandle; + pAllocationData->allocationData.pGpuMemory = pGpuMemory; + + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - pAllocationData->reportedToDeviceMemoryReport = true; - pAllocationData->objectType = objectType; - pAllocationData->objectHandle = objectHandle; - pAllocationData->allocationData.pGpuMemory = pGpuMemory; - - const auto& gpuMemoryDesc = pAllocationData->allocationData.pGpuMemory->Desc(); - - DeviceMemoryReportAllocateEvent( - pAllocationData->objectHandle, - gpuMemoryDesc.size, - pAllocationData->objectType, - gpuMemoryDesc.uniqueId, - heapIndex, - gpuMemoryDesc.flags.isExternal); + if (pAllocationData->reportedToDeviceMemoryReport == false) + { + pAllocationData->reportedToDeviceMemoryReport = true; + + const auto& gpuMemoryDesc = pAllocationData->allocationData.pGpuMemory->Desc(); + + DeviceMemoryReportAllocateEvent( + pAllocationData->objectHandle, + gpuMemoryDesc.size, + pAllocationData->objectType, + gpuMemoryDesc.uniqueId, + heapIndex, + gpuMemoryDesc.flags.isExternal); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the allocation of an already reported allocation."); + } } - else + + if (pDevice->GetEnabledFeatures().deviceAddressBindingReport) { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the allocation of an already reported allocation."); + DeviceAddressBindingReportBindEvent(pAllocationData); } } else @@ -368,45 +465,20 @@ void GpuMemoryEventHandler::VulkanAllocateEvent( // ===================================================================================================================== void GpuMemoryEventHandler::VulkanAllocationFailedEvent( + const Device* pDevice, Pal::gpusize allocatedSize, VkObjectType objectType, uint64_t heapIndex) { - DeviceMemoryReportAllocationFailedEvent(allocatedSize, objectType, heapIndex); -} - -// ===================================================================================================================== -void GpuMemoryEventHandler::VulkanFreeEvent( - const Pal::IGpuMemory* pGpuMemory) -{ - Util::RWLockAuto lock(&m_allocationHashMapLock); - AllocationData* pAllocationData = m_allocationHashMap.FindKey(pGpuMemory); - - if (pAllocationData != nullptr) + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - if (pAllocationData->reportedToDeviceMemoryReport == true) - { - const auto& gpuMemoryDesc = 
pAllocationData->allocationData.pGpuMemory->Desc(); - - DeviceMemoryReportFreeEvent( - pAllocationData->objectHandle, - pAllocationData->objectType, - gpuMemoryDesc.uniqueId, - gpuMemoryDesc.flags.isExternal); - } - else - { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an unreported allocation."); - } - } - else - { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an untracked allocation."); + DeviceMemoryReportAllocationFailedEvent(allocatedSize, objectType, heapIndex); } } // ===================================================================================================================== void GpuMemoryEventHandler::VulkanSubAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset, Pal::gpusize subAllocationSize, @@ -427,7 +499,6 @@ void GpuMemoryEventHandler::VulkanSubAllocateEvent( { if (exists == false) { - pSubAllocData->reportedToDeviceMemoryReport = true; pSubAllocData->allocationData.pGpuMemory = pGpuMemory; pSubAllocData->memoryObjectId = GenerateMemoryObjectId(); pSubAllocData->objectType = objectType; @@ -436,13 +507,23 @@ void GpuMemoryEventHandler::VulkanSubAllocateEvent( pSubAllocData->objectHandle = objectHandle; pSubAllocData->heapIndex = heapIndex; - DeviceMemoryReportAllocateEvent( - pSubAllocData->objectHandle, - pSubAllocData->subAllocationSize, - pSubAllocData->objectType, - pSubAllocData->memoryObjectId, - pSubAllocData->heapIndex, - pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + if (pDevice->GetEnabledFeatures().deviceMemoryReport) + { + pSubAllocData->reportedToDeviceMemoryReport = true; + + DeviceMemoryReportAllocateEvent( + pSubAllocData->objectHandle, + pSubAllocData->subAllocationSize, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->heapIndex, + pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + } + + if (pDevice->GetEnabledFeatures().deviceAddressBindingReport) + { + DeviceAddressBindingReportBindEvent(pSubAllocData); + } } else { @@ -453,6 +534,7 @@ void GpuMemoryEventHandler::VulkanSubAllocateEvent( // ===================================================================================================================== void GpuMemoryEventHandler::VulkanSubFreeEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset) { @@ -464,17 +546,25 @@ void GpuMemoryEventHandler::VulkanSubFreeEvent( if (pSubAllocData != nullptr) { - if (pSubAllocData->reportedToDeviceMemoryReport == true) + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - DeviceMemoryReportFreeEvent( - pSubAllocData->objectHandle, - pSubAllocData->objectType, - pSubAllocData->memoryObjectId, - pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + if (pSubAllocData->reportedToDeviceMemoryReport) + { + DeviceMemoryReportFreeEvent( + pSubAllocData->objectHandle, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->allocationData.pGpuMemory->Desc().flags.isExternal); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an unreported Vulkan suballocation."); + } } - else + + if (pDevice->GetEnabledFeatures().deviceAddressBindingReport) { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the free of an unreported Vulkan suballocation."); + DeviceAddressBindingReportUnbindEvent(pSubAllocData); } m_vulkanSubAllocationHashMap.Erase(key); @@ -575,12 +665,13 @@ void GpuMemoryEventHandler::DeviceMemoryReportFreeEvent( // 
===================================================================================================================== void GpuMemoryEventHandler::ReportDeferredPalSubAlloc( + const Device* pDevice, Pal::gpusize gpuVirtAddr, Pal::gpusize offset, const uint64_t objectHandle, const VkObjectType objectType) { - Util::RWLockAuto lock(&m_palSubAllocationHashMapLock); + Util::RWLockAuto lock(&m_palSubAllocationHashMapLock); SubAllocationKey key = {gpuVirtAddr, offset}; @@ -589,24 +680,28 @@ void GpuMemoryEventHandler::ReportDeferredPalSubAlloc( if (pSubAllocData != nullptr) { - if (pSubAllocData->reportedToDeviceMemoryReport == false) - { - // Report deferred Pal suballocation to device_memory_report now - pSubAllocData->objectHandle = objectHandle; - pSubAllocData->objectType = objectType; - pSubAllocData->reportedToDeviceMemoryReport = true; - - DeviceMemoryReportAllocateEvent( - pSubAllocData->objectHandle, - pSubAllocData->allocationData.size, - pSubAllocData->objectType, - pSubAllocData->memoryObjectId, - pSubAllocData->heapIndex, - pSubAllocData->allocationData.flags.isExternal); - } - else + pSubAllocData->objectHandle = objectHandle; + pSubAllocData->objectType = objectType; + + if (pDevice->GetEnabledFeatures().deviceMemoryReport) { - PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report the allocation of an already reported Pal suballocation."); + if (pSubAllocData->reportedToDeviceMemoryReport == false) + { + // Report deferred Pal suballocation to device_memory_report now + pSubAllocData->reportedToDeviceMemoryReport = true; + + DeviceMemoryReportAllocateEvent( + pSubAllocData->objectHandle, + pSubAllocData->allocationData.size, + pSubAllocData->objectType, + pSubAllocData->memoryObjectId, + pSubAllocData->heapIndex, + pSubAllocData->allocationData.flags.isExternal); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report allocation of an already reported Pal suballoc."); + } } } else @@ -619,7 +714,7 @@ void GpuMemoryEventHandler::ReportDeferredPalSubAlloc( void GpuMemoryEventHandler::SendDeviceMemoryReportEvent( const VkDeviceMemoryReportCallbackDataEXT& callbackData) { - Util::RWLockAuto lock(&m_callbacksLock); + Util::RWLockAuto lock(&m_callbacksLock); for (auto iter = m_callbacks.Begin(); iter.IsValid(); iter.Next()) { @@ -627,4 +722,377 @@ void GpuMemoryEventHandler::SendDeviceMemoryReportEvent( } } +// ===================================================================================================================== +// Creates the BindDataListNode class. 
+void GpuMemoryEventHandler::BindDataListNode::Create( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData, + GpuMemoryEventHandler::BindDataListNode** ppObject) +{ + void* pSystemMem = pInstance->AllocMem( + sizeof(GpuMemoryEventHandler::BindDataListNode), + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pSystemMem != nullptr) + { + BindDataListNode* pBindDataListNode = + VK_PLACEMENT_NEW(pSystemMem) BindDataListNode(pInstance, pBindGpuMemoryData); + + *ppObject = pBindDataListNode; + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::BindDataListNode::Destroy() +{ + Util::Destructor(this); + + m_pInstance->FreeMem(this); +} + +// ===================================================================================================================== +GpuMemoryEventHandler::BindDataListNode::BindDataListNode( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData) + : m_pInstance(pInstance) + , m_node(this) +{ + m_data.bindGpuMemoryData = *pBindGpuMemoryData; + m_data.objectHandle = NullHandle; + m_data.objectType = VK_OBJECT_TYPE_UNKNOWN; + m_data.reportedToDeviceAddressBindingReport = false; +} + +// ===================================================================================================================== +bool GpuMemoryEventHandler::CheckIntervalsIntersect( + const Interval& intervalOne, + const Interval& intervalTwo + ) const +{ + bool intersect = false; + + if (intervalOne.m_offset < intervalTwo.m_offset) + { + intersect = intervalTwo.m_offset <= (intervalOne.m_offset + intervalOne.m_size); + } + else + { + intersect = intervalOne.m_offset <= (intervalTwo.m_offset + intervalTwo.m_size); + } + + return intersect; +} + +// ===================================================================================================================== +// The caller of this function must hold the m_bindHashMapMutex mutex +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + const Pal::IGpuMemory* pGpuMemory, + const Interval& interval) +{ + BindDataList* pBindDataList = m_bindHashMap.FindKey(pGpuMemory); + + if (pBindDataList != nullptr) + { + auto iter = pBindDataList->Begin(); + while(iter.IsValid()) + { + BindData* pBindData = iter.Get()->GetData(); + bool intersect = true; + + if (interval.m_size > 0) + { + intersect = CheckIntervalsIntersect( + Interval(pBindData->bindGpuMemoryData.offset, pBindData->bindGpuMemoryData.requiredGpuMemSize), + interval); + } + + if (intersect) + { + BindDataListNode* pBindDataListNode = iter.Get(); + if (pBindData->reportedToDeviceAddressBindingReport) + { + ReportUnbindEvent(pBindData); + } + else + { + PAL_ALERT_ALWAYS_MSG("Trying to report unbind, but bind was not reported previously."); + } + + pBindDataList->Erase(&iter); + pBindDataListNode->Destroy(); + } + else + { + iter.Next(); + } + } + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportBindEvent( + const AllocationData* pAllocationData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + BindDataList* pBindDataList = m_bindHashMap.FindKey(pAllocationData->allocationData.pGpuMemory); + + if (pBindDataList != nullptr) + { + for (auto iter = pBindDataList->Begin(); iter.IsValid(); iter.Next()) + { + ReportBindEvent(iter.Get()->GetData(), pAllocationData->objectHandle, pAllocationData->objectType); + } 
+ } +} +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + const AllocationData* pAllocationData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + DeviceAddressBindingReportUnbindEvent(pAllocationData->allocationData.pGpuMemory, Interval()); +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportBindEvent( + const SubAllocationData* pSubAllocData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + BindDataList* pBindDataList = m_bindHashMap.FindKey(pSubAllocData->allocationData.pGpuMemory); + + if (pBindDataList != nullptr) + { + for (auto iter = pBindDataList->Begin(); iter.IsValid(); iter.Next()) + { + BindData* pBindData = iter.Get()->GetData(); + + bool intersect = CheckIntervalsIntersect( + Interval(pSubAllocData->offset, pSubAllocData->subAllocationSize), + Interval(pBindData->bindGpuMemoryData.offset, pBindData->bindGpuMemoryData.requiredGpuMemSize)); + + if (intersect) + { + ReportBindEvent(pBindData, pSubAllocData->objectHandle, pSubAllocData->objectType); + } + } + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + const SubAllocationData* pSubAllocData) +{ + Util::MutexAuto lock(&m_bindHashMapMutex); + DeviceAddressBindingReportUnbindEvent( + pSubAllocData->allocationData.pGpuMemory, + Interval(pSubAllocData->offset, pSubAllocData->subAllocationSize)); +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportBindEvent( + BindData* pNewBindData) +{ + // The caller of this function must hold the m_bindHashMapMutex mutex + m_allocationHashMapLock.LockForRead(); + + const AllocationData* pAllocationData = m_allocationHashMap.FindKey(pNewBindData->bindGpuMemoryData.pGpuMemory); + + if (pAllocationData != nullptr) + { + if (pAllocationData->objectHandle != NullHandle) + { + ReportBindEvent(pNewBindData, pAllocationData->objectHandle, pAllocationData->objectType); + } + else if ((pAllocationData->allocationData.flags.isClient == 0) && + (pAllocationData->allocationData.flags.buddyAllocated == 0) && + (pAllocationData->allocationData.flags.isCmdAllocator == 0) && + (pAllocationData->allocationData.flags.isExternal == 0)) + { + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + + // Pal internal allocation; attribute this to the physical device + ReportBindEvent( + pNewBindData, + ApiPhysicalDevice::IntValueFromHandle(pHandle), + VK_OBJECT_TYPE_PHYSICAL_DEVICE); + } + } + m_allocationHashMapLock.UnlockForRead(); + + if (pNewBindData->reportedToDeviceAddressBindingReport == false) + { + m_palSubAllocationHashMapLock.LockForRead(); + for (auto iter = m_palSubAllocationHashMap.Begin(); iter.Get() != nullptr; iter.Next()) + { + const SubAllocationData* pSubAllocData = &iter.Get()->value; + if (pSubAllocData->allocationData.pGpuMemory != pNewBindData->bindGpuMemoryData.pGpuMemory) + { + continue; + } + + bool intersect = CheckIntervalsIntersect( + Interval(pSubAllocData->offset, 
pSubAllocData->subAllocationSize), + Interval(pNewBindData->bindGpuMemoryData.offset, pNewBindData->bindGpuMemoryData.requiredGpuMemSize)); + + if (intersect) + { + if (pSubAllocData->objectHandle != NullHandle) + { + ReportBindEvent(pNewBindData, pSubAllocData->objectHandle, pSubAllocData->objectType); + } + else if ((pSubAllocData->allocationData.flags.isClient == 0) && + ((pSubAllocData->allocationData.flags.buddyAllocated == 1) || + (pSubAllocData->allocationData.flags.isCmdAllocator == 1)) && + (pSubAllocData->allocationData.flags.isExternal == 0)) + { + const Device* pDevice = m_deviceHashSet.Begin().Get()->key; + PhysicalDevice* pPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + auto* const pHandle = ApiPhysicalDevice::FromObject(pPhysicalDevice); + + // Pal internal allocation; attribute this to the physical device + ReportBindEvent( + pNewBindData, + ApiPhysicalDevice::IntValueFromHandle(pHandle), + VK_OBJECT_TYPE_PHYSICAL_DEVICE); + } + break; + } + } + m_palSubAllocationHashMapLock.UnlockForRead(); + } + + if (pNewBindData->reportedToDeviceAddressBindingReport == false) + { + m_vulkanSubAllocationHashMapLock.LockForRead(); + for (auto iter = m_vulkanSubAllocationHashMap.Begin(); iter.Get() != nullptr; iter.Next()) + { + const SubAllocationData* pSubAllocData = &iter.Get()->value; + if (pSubAllocData->allocationData.pGpuMemory != pNewBindData->bindGpuMemoryData.pGpuMemory) + { + continue; + } + + if (pSubAllocData->objectHandle == NullHandle) + { + continue; + } + + bool intersect = CheckIntervalsIntersect( + Interval(pSubAllocData->offset, pSubAllocData->subAllocationSize), + Interval(pNewBindData->bindGpuMemoryData.offset, pNewBindData->bindGpuMemoryData.requiredGpuMemSize)); + + if (intersect) + { + ReportBindEvent(pNewBindData, pSubAllocData->objectHandle, pSubAllocData->objectType); + break; + } + } + m_vulkanSubAllocationHashMapLock.UnlockForRead(); + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportUnbindEvent( + BindData* pNewBindData) +{ + // The caller of this function must hold the m_bindHashMapMutex mutex + DeviceAddressBindingReportUnbindEvent( + pNewBindData->bindGpuMemoryData.pGpuMemory, + Interval(pNewBindData->bindGpuMemoryData.offset, pNewBindData->bindGpuMemoryData.requiredGpuMemSize)); +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::ReportBindEvent( + BindData* pBindData, + uint64_t objectHandle, + VkObjectType objectType) +{ + if (pBindData->reportedToDeviceAddressBindingReport == false) + { + pBindData->objectHandle = objectHandle; + pBindData->objectType = objectType; + pBindData->reportedToDeviceAddressBindingReport = true; + + DeviceAddressBindingReportCallback( + pBindData->objectHandle, + pBindData->objectType, + VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT, + pBindData->bindGpuMemoryData.pGpuMemory->Desc().gpuVirtAddr + pBindData->bindGpuMemoryData.offset, + pBindData->bindGpuMemoryData.requiredGpuMemSize, + pBindData->objectHandle == NullHandle); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report an already reported bind"); + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::ReportUnbindEvent( + BindData* pBindData) +{ + if (pBindData->reportedToDeviceAddressBindingReport) + { + 
DeviceAddressBindingReportCallback( + pBindData->objectHandle, + pBindData->objectType, + VK_DEVICE_ADDRESS_BINDING_TYPE_UNBIND_EXT, + pBindData->bindGpuMemoryData.pGpuMemory->Desc().gpuVirtAddr + pBindData->bindGpuMemoryData.offset, + pBindData->bindGpuMemoryData.requiredGpuMemSize, + pBindData->objectHandle == NullHandle); + } + else + { + PAL_ALERT_ALWAYS_MSG("Vulkan is trying to report unbind of an unreported Vulkan bind."); + } +} + +// ===================================================================================================================== +void GpuMemoryEventHandler::DeviceAddressBindingReportCallback( + uint64_t objectHandle, + VkObjectType objectType, + VkDeviceAddressBindingTypeEXT bindingType, + VkDeviceAddress bindingAddress, + VkDeviceSize allocatedSize, + bool isInternal) +{ + VkDeviceAddressBindingCallbackDataEXT bindingCallbackData = {}; + + bindingCallbackData.sType = VK_STRUCTURE_TYPE_DEVICE_ADDRESS_BINDING_CALLBACK_DATA_EXT; + bindingCallbackData.pNext = nullptr; + bindingCallbackData.flags = isInternal ? VK_DEVICE_ADDRESS_BINDING_INTERNAL_OBJECT_BIT_EXT : 0; + bindingCallbackData.baseAddress = bindingAddress; + bindingCallbackData.size = allocatedSize; + bindingCallbackData.bindingType = bindingType; + + VkDebugUtilsObjectNameInfoEXT objectNameInfo = {}; + + objectNameInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT; + objectNameInfo.pNext = nullptr; + objectNameInfo.objectType = objectType; + objectNameInfo.objectHandle = objectHandle; + objectNameInfo.pObjectName = nullptr; + + VkDebugUtilsMessengerCallbackDataEXT callbackData = {}; + + callbackData.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CALLBACK_DATA_EXT; + callbackData.pNext = &bindingCallbackData; + callbackData.flags = 0; // reserved for future use + callbackData.pMessageIdName = nullptr; + callbackData.messageIdNumber = 0; + callbackData.pMessage = nullptr; + callbackData.queueLabelCount = 0; + callbackData.pQueueLabels = nullptr; + callbackData.cmdBufLabelCount = 0; + callbackData.pCmdBufLabels = nullptr; + callbackData.objectCount = 1; + callbackData.pObjects = &objectNameInfo; + + m_pInstance->CallExternalMessengers( + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT, + VK_DEBUG_UTILS_MESSAGE_TYPE_DEVICE_ADDRESS_BINDING_BIT_EXT, + &callbackData); +} + } // namespace vk diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index d3ed09d0..a477e89c 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -293,8 +293,6 @@ void GraphicsPipelineCommon::GetSubpassSampleCount( // subpassCoverageSampleCount would be equal to zero if there are zero attachments. coverageSampleCount = (coverageSampleCount == 0) ? rasterizationSampleCount : coverageSampleCount; - VK_ASSERT(rasterizationSampleCount == coverageSampleCount); - if (pCoverageSampleCount != nullptr) { *pCoverageSampleCount = coverageSampleCount; @@ -1012,9 +1010,6 @@ static void BuildRasterizationState( pInfo->staticStateMask |= 1ULL << static_cast(DynamicStatesInternal::DepthBias); } - // point size must be set via gl_PointSize, otherwise it must be 1.0f. 
- constexpr float DefaultPointSize = 1.0f; - pInfo->immedInfo.pointLineRasterParams.lineWidth = pRs->lineWidth; pInfo->immedInfo.pointLineRasterParams.pointSize = DefaultPointSize; pInfo->immedInfo.pointLineRasterParams.pointSizeMin = limits.pointSizeRange[0]; @@ -1397,12 +1392,13 @@ static void BuildMultisampleState( pInfo->immedInfo.msaaCreateInfo.flags.enable1xMsaaSampleLocations = (pInfo->immedInfo.msaaCreateInfo.coverageSamples == 1); - if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::SampleLocations) == false) + if ((IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::SampleLocations) == false) && + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::RasterizationSamples) == false)) { if (pPipelineSampleLocationsStateCreateInfoEXT != nullptr) { // We store the custom sample locations if custom sample locations are enabled and the - // sample locations state is static. + // sample locations state is static and rasterizationSamples is not configured dynamically. pInfo->immedInfo.samplePattern.sampleCount = (uint32_t)pPipelineSampleLocationsStateCreateInfoEXT->sampleLocationsInfo.sampleLocationsPerPixel; @@ -1424,9 +1420,10 @@ static void BuildMultisampleState( (1ULL << static_cast<uint32_t>(DynamicStatesInternal::SampleLocations)); } } - else + else if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::RasterizationSamples) == false) { - // We store the standard sample locations if custom sample locations are not enabled. + // We store the standard sample locations if custom sample locations are not enabled and + // rasterizationSamples is not configured dynamically. pInfo->immedInfo.samplePattern.sampleCount = pMs->rasterizationSamples; pInfo->immedInfo.samplePattern.locations = *Device::GetDefaultQuadSamplePattern(pMs->rasterizationSamples);
diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index 213d11c5..764fb497 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -69,6 +69,7 @@ enum class AppProfile : uint32_t DawnOfWarIII, // Dawn of War III by Feral3D WarHammerII, // Total War: WarHammer II by Feral3D WarHammerIII, // Total War: WarHammer III by Feral3D + RomeRemastered, // Total War: Rome Remastered AshesOfTheSingularity, // Ashes Of The Singularity StrangeBrigade, // Strange Brigade WorldWarZ, // WorldWarZ @@ -129,6 +130,7 @@ enum class AppProfile : uint32_t Satisfactory, // Satisfactory by Coffee Stain Studios QuakeEnhanced, // Quake Enhanced by id Software Zink, // Zink + SOTTR, // Shadow of the Tomb Raider (Steam version) }; struct ProfileSettings
diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index 159ad80d..40412dee 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -46,6 +46,7 @@ namespace vk class PhysicalDevice; class PipelineCache; +class PipelineBinaryCache; class ShaderCache; class DeferredHostOperation; @@ -64,8 +65,8 @@ enum FreeCompilerBinary : uint32_t struct ShaderModuleHandle { uint32_t* pRefCount; - void* pLlpcShaderModule; // Shader module handle from LLPC Vkgc::BinaryData elfPackage; // Generated ElfPackage from LLPC + void* pLlpcShaderModule; // Shader module handle from LLPC }; // ===================================================================================================================== @@ -143,6 +144,9 @@ struct GraphicsPipelineBinaryCreateInfo PipelineCompilerType compilerType; bool linkTimeOptimization; Vkgc::BinaryData
earlyElfPackage[ShaderStage::ShaderStageGfxCount]; + Util::MetroHash::Hash earlyElfPackageHash[ShaderStage::ShaderStageGfxCount]; + uint64_t apiPsoHash; + uint64_t libraryHash[ShaderStage::ShaderStageGfxCount]; FreeCompilerBinary freeCompilerBinary; PipelineCreationFeedback pipelineFeedback; PipelineCreationFeedback stageFeedback[ShaderStage::ShaderStageGfxCount]; @@ -167,6 +171,7 @@ struct ComputePipelineBinaryCreateInfo PipelineCreationFeedback pipelineFeedback; PipelineCreationFeedback stageFeedback; PipelineMetadata* pBinaryMetadata; + uint64_t apiPsoHash; }; #if VKI_RAY_TRACING @@ -186,6 +191,7 @@ struct RayTracingPipelineBinaryCreateInfo uint32_t maxAttributeSize; bool allowShaderInlining; DeferredWorkload* pDeferredWorkload; + uint64_t apiPsoHash; }; // ===================================================================================================================== @@ -209,22 +215,14 @@ class CompilerSolution CompilerSolution(PhysicalDevice* pPhysicalDevice); virtual ~CompilerSolution(); - virtual VkResult Initialize(Vkgc::GfxIpVersion gfxIp, Pal::GfxIpLevel gfxIpLevel, Vkgc::ICache* pCache) = 0; + virtual VkResult Initialize(Vkgc::GfxIpVersion gfxIp, Pal::GfxIpLevel gfxIpLevel, PipelineBinaryCache* pCache); virtual void Destroy() = 0; - virtual size_t GetShaderCacheSize(PipelineCompilerType cacheType) = 0; - - virtual VkResult CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - void* pShaderCacheMem, - uint32_t expectedEntries, - ShaderCache* pShaderCache) = 0; - virtual VkResult BuildShaderModule( const Device* pDevice, VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -253,6 +251,7 @@ class CompilerSolution virtual VkResult CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, void* pPipelineDumpHandle, diff --git a/icd/api/include/compiler_solution_llpc.h b/icd/api/include/compiler_solution_llpc.h index 0058968d..189d7126 100644 --- a/icd/api/include/compiler_solution_llpc.h +++ b/icd/api/include/compiler_solution_llpc.h @@ -77,22 +77,17 @@ class CompilerSolutionLlpc final : public CompilerSolution public: // Overridden functions - virtual VkResult Initialize(Vkgc::GfxIpVersion gfxIp, Pal::GfxIpLevel gfxIpLevel, Vkgc::ICache* pCache) override; + virtual VkResult Initialize( + Vkgc::GfxIpVersion gfxIp, + Pal::GfxIpLevel gfxIpLevel, + PipelineBinaryCache* pCache) override; virtual void Destroy() override; - virtual size_t GetShaderCacheSize(PipelineCompilerType cacheType) override; - - virtual VkResult CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - void* pShaderCacheMem, - uint32_t expectedEntries, - ShaderCache* pShaderCache) override; - virtual VkResult BuildShaderModule( const Device* pDevice, VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -121,6 +116,7 @@ class CompilerSolutionLlpc final : public CompilerSolution virtual VkResult CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, void* pPipelineDumpHandle, diff --git a/icd/api/include/gpumemory_event_handler.h b/icd/api/include/gpumemory_event_handler.h index 26606875..a1a40588 100644 --- a/icd/api/include/gpumemory_event_handler.h +++ 
b/icd/api/include/gpumemory_event_handler.h @@ -32,7 +32,9 @@ #include "include/vk_alloccb.h" #include "include/vk_utils.h" +#include "palIntrusiveList.h" #include "palHashMap.h" +#include "palHashSet.h" #include "palMutex.h" #include "palUtil.h" #include "palVector.h" @@ -61,18 +63,20 @@ class GpuMemoryEventHandler Pal::Developer::CallbackType type, void* pCbData); - void EnableGpuMemoryEvents(); + void EnableGpuMemoryEvents( + const Device* pDevice); - void DisableGpuMemoryEvents(); + void DisableGpuMemoryEvents( + const Device* pDevice); - VK_FORCEINLINE bool IsGpuMemoryEventHandlerEnabled() { return m_memoryEventEnables > 0; } + VK_FORCEINLINE bool IsGpuMemoryEventHandlerEnabled() { return m_deviceCount > 0; } - typedef struct + struct DeviceMemoryReportCallback { PFN_vkDeviceMemoryReportCallbackEXT callback; void* pData; const Device* pDevice; - } DeviceMemoryReportCallback; + }; typedef Util::Vector DeviceMemoryReportCallbacks; @@ -83,20 +87,20 @@ class GpuMemoryEventHandler const Device* pDevice); void VulkanAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, uint64_t objectHandle, VkObjectType objectType, uint64_t heapIndex); void VulkanAllocationFailedEvent( + const Device* pDevice, Pal::gpusize allocatedSize, VkObjectType objectType, uint64_t heapIndex); - void VulkanFreeEvent( - const Pal::IGpuMemory* pGpuMemory); - void VulkanSubAllocateEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset, Pal::gpusize subAllocationSize, @@ -105,10 +109,12 @@ class GpuMemoryEventHandler uint64_t heapIndex); void VulkanSubFreeEvent( + const Device* pDevice, const Pal::IGpuMemory* pGpuMemory, Pal::gpusize offset); void ReportDeferredPalSubAlloc( + const Device* pDevice, Pal::gpusize gpuVirtAddr, Pal::gpusize offset, const uint64_t objectHandle, @@ -121,6 +127,77 @@ PAL_DISALLOW_COPY_AND_ASSIGN(GpuMemoryEventHandler);
+ struct AllocationData + { + Pal::Developer::GpuMemoryData allocationData; + uint64_t objectHandle; + VkObjectType objectType; + bool reportedToDeviceMemoryReport; + }; + + struct SubAllocationData + { + Pal::Developer::GpuMemoryData allocationData; + uint64_t objectHandle; + VkObjectType objectType; + bool reportedToDeviceMemoryReport; + uint64_t memoryObjectId; + Pal::gpusize subAllocationSize; + Pal::gpusize offset; + uint64_t heapIndex; + }; + + struct BindData + { + Pal::Developer::BindGpuMemoryData bindGpuMemoryData; + uint64_t objectHandle; + VkObjectType objectType; + bool reportedToDeviceAddressBindingReport; + }; + + class BindDataListNode + { + public: + static void Create( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData, + BindDataListNode** ppObject); + + void Destroy(); + + BindData* GetData() { return &m_data; } + Util::IntrusiveListNode<BindDataListNode>* GetNode() { return &m_node; } + + private: + BindDataListNode( + Instance* pInstance, + Pal::Developer::BindGpuMemoryData* pBindGpuMemoryData); + + Instance* m_pInstance; + BindData m_data; + Util::IntrusiveListNode<BindDataListNode> m_node; + + PAL_DISALLOW_COPY_AND_ASSIGN(BindDataListNode); + }; + + struct Interval + { + Interval() + : m_offset(0), m_size(0) + { + } + + Interval(const Pal::gpusize offset, const Pal::gpusize size) + : m_offset(offset), m_size(size) + { + } + + Pal::gpusize m_offset; + Pal::gpusize m_size; + }; + + static_assert(std::is_standard_layout<Interval>::value); + void HandlePalDeveloperCallback( Pal::Developer::CallbackType type, void* pCbData); @@ -147,6 +224,49 @@ class GpuMemoryEventHandler void
SendDeviceMemoryReportEvent( const VkDeviceMemoryReportCallbackDataEXT& callbackData); + // The caller of this function must hold the m_bindHashMapMutex mutex + void DeviceAddressBindingReportUnbindEvent( + const Pal::IGpuMemory* pGpuMemory, + const Interval& interval); + + void DeviceAddressBindingReportBindEvent( + const AllocationData* pAllocationData); + + void DeviceAddressBindingReportUnbindEvent( + const AllocationData* pAllocationData); + + void DeviceAddressBindingReportBindEvent( + const SubAllocationData* pSubAllocData); + + void DeviceAddressBindingReportUnbindEvent( + const SubAllocationData* pSubAllocData); + + void DeviceAddressBindingReportBindEvent( + BindData* pNewBindData); + + void DeviceAddressBindingReportUnbindEvent( + BindData* pNewBindData); + + void DeviceAddressBindingReportCallback( + uint64_t objectHandle, + VkObjectType objectType, + VkDeviceAddressBindingTypeEXT bindingType, + VkDeviceAddress bindingAddress, + VkDeviceSize allocatedSize, + bool isInternal); + + void ReportBindEvent( + BindData* pBindData, + uint64_t objectHandle, + VkObjectType objectType); + + void ReportUnbindEvent( + BindData* pBindData); + + bool CheckIntervalsIntersect( + const Interval& intervalOne, + const Interval& intervalTwo) const; + // Generates an ID, unique within the instance, for a GPU memory object uint64_t GenerateMemoryObjectId() { return Util::AtomicIncrement64(&m_memoryObjectId); } @@ -155,14 +275,6 @@ DeviceMemoryReportCallbacks m_callbacks; Util::RWLock m_callbacksLock; - typedef struct - { - Pal::Developer::GpuMemoryData allocationData; - uint64_t objectHandle; - VkObjectType objectType; - bool reportedToDeviceMemoryReport; - } AllocationData; - typedef Util::HashMap GpuMemoryAllocationHashMap; @@ -170,37 +282,39 @@ GpuMemoryAllocationHashMap m_allocationHashMap; Util::RWLock m_allocationHashMapLock; - typedef struct + struct SubAllocationKey { Pal::gpusize gpuVirtAddr; Pal::gpusize offset; - } SubAllocationKey; - - typedef struct - { - Pal::Developer::GpuMemoryData allocationData; - uint64_t objectHandle; - VkObjectType objectType; - bool reportedToDeviceMemoryReport; - uint64_t memoryObjectId; - Pal::gpusize subAllocationSize; - Pal::gpusize offset; - uint64_t heapIndex; - } SubAllocationData; + };
typedef Util::HashMap GpuMemorySubAllocationHashMap; + typedef Util::IntrusiveList<BindDataListNode> BindDataList; + + typedef Util::HashMap GpuMemoryBindHashMap; + GpuMemorySubAllocationHashMap m_vulkanSubAllocationHashMap; Util::RWLock m_vulkanSubAllocationHashMapLock; GpuMemorySubAllocationHashMap m_palSubAllocationHashMap; Util::RWLock m_palSubAllocationHashMapLock; + GpuMemoryBindHashMap m_bindHashMap; + Util::Mutex m_bindHashMapMutex; + + typedef Util::HashSet DeviceHashSet; + + DeviceHashSet m_deviceHashSet; + Util::RWLock m_deviceHashSetLock; + volatile uint64_t m_memoryObjectId; // Seed for memoryObjectId generation - volatile uint32_t m_memoryEventEnables; // The number of device extensions requesting memory events + volatile uint32_t m_deviceCount; // The number of devices with extensions that require memory events }; } // namespace vk diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index c0936964..a2e7ed3c 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -68,7 +68,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should
always be set to 0 // Version of this file -#define VK_HEADER_VERSION 246 +#define VK_HEADER_VERSION 250 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -1002,6 +1002,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_OPTICAL_FLOW_SESSION_CREATE_PRIVATE_DATA_INFO_NV = 1000464010, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LEGACY_DITHERING_FEATURES_EXT = 1000465000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_PROTECTED_ACCESS_FEATURES_EXT = 1000466000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_POSITION_FETCH_FEATURES_KHR = 1000481000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_FEATURES_EXT = 1000482000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_PROPERTIES_EXT = 1000482001, VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT = 1000482002, @@ -1019,6 +1020,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_LIBRARY_GROUP_HANDLES_FEATURES_EXT = 1000498000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PER_VIEW_RENDER_AREAS_FEATURES_QCOM = 1000510000, VK_STRUCTURE_TYPE_MULTIVIEW_PER_VIEW_RENDER_AREAS_RENDER_PASS_BEGIN_INFO_QCOM = 1000510001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_FEATURES_EXT = 1000524000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -1871,6 +1873,7 @@ typedef enum VkDynamicState { VK_DYNAMIC_STATE_SHADING_RATE_IMAGE_ENABLE_NV = 1000455030, VK_DYNAMIC_STATE_REPRESENTATIVE_FRAGMENT_TEST_ENABLE_NV = 1000455031, VK_DYNAMIC_STATE_COVERAGE_REDUCTION_MODE_NV = 1000455032, + VK_DYNAMIC_STATE_ATTACHMENT_FEEDBACK_LOOP_ENABLE_EXT = 1000524000, VK_DYNAMIC_STATE_CULL_MODE_EXT = VK_DYNAMIC_STATE_CULL_MODE, VK_DYNAMIC_STATE_FRONT_FACE_EXT = VK_DYNAMIC_STATE_FRONT_FACE, VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT = VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY, @@ -9419,11 +9422,11 @@ typedef struct VkVideoDecodeH265SessionParametersCreateInfoKHR { } VkVideoDecodeH265SessionParametersCreateInfoKHR; typedef struct VkVideoDecodeH265PictureInfoKHR { - VkStructureType sType; - const void* pNext; - StdVideoDecodeH265PictureInfo* pStdPictureInfo; - uint32_t sliceSegmentCount; - const uint32_t* pSliceSegmentOffsets; + VkStructureType sType; + const void* pNext; + const StdVideoDecodeH265PictureInfo* pStdPictureInfo; + uint32_t sliceSegmentCount; + const uint32_t* pSliceSegmentOffsets; } VkVideoDecodeH265PictureInfoKHR; typedef struct VkVideoDecodeH265DpbSlotInfoKHR { @@ -10230,6 +10233,17 @@ VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageSparseMemoryRequirementsKHR( #endif +#define VK_KHR_ray_tracing_position_fetch 1 +#define VK_KHR_RAY_TRACING_POSITION_FETCH_SPEC_VERSION 1 +#define VK_KHR_RAY_TRACING_POSITION_FETCH_EXTENSION_NAME "VK_KHR_ray_tracing_position_fetch" +typedef struct VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 rayTracingPositionFetch; +} VkPhysicalDeviceRayTracingPositionFetchFeaturesKHR; + + + #define VK_EXT_debug_report 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT) #define VK_EXT_DEBUG_REPORT_SPEC_VERSION 10 @@ -12090,6 +12104,7 @@ typedef enum VkBuildAccelerationStructureFlagBitsKHR { #ifdef VK_ENABLE_BETA_EXTENSIONS 
VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DISPLACEMENT_MICROMAP_UPDATE_NV = 0x00000200, #endif + VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_DATA_ACCESS_KHR = 0x00000800, VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_NV = VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR, VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_NV = VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR, VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_NV = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR, @@ -14695,7 +14710,7 @@ typedef struct VkPhysicalDevice4444FormatsFeaturesEXT { #define VK_EXT_device_fault 1 -#define VK_EXT_DEVICE_FAULT_SPEC_VERSION 1 +#define VK_EXT_DEVICE_FAULT_SPEC_VERSION 2 #define VK_EXT_DEVICE_FAULT_EXTENSION_NAME "VK_EXT_device_fault" typedef enum VkDeviceFaultAddressTypeEXT { @@ -14759,6 +14774,8 @@ typedef struct VkDeviceFaultVendorBinaryHeaderVersionOneEXT { uint32_t applicationNameOffset; uint32_t applicationVersion; uint32_t engineNameOffset; + uint32_t engineVersion; + uint32_t apiVersion; } VkDeviceFaultVendorBinaryHeaderVersionOneEXT; typedef VkResult (VKAPI_PTR *PFN_vkGetDeviceFaultInfoEXT)(VkDevice device, VkDeviceFaultCountsEXT* pFaultCounts, VkDeviceFaultInfoEXT* pFaultInfo); @@ -15540,7 +15557,7 @@ VKAPI_ATTR void VKAPI_CALL vkGetMicromapBuildSizesEXT( #define VK_HUAWEI_cluster_culling_shader 1 -#define VK_HUAWEI_CLUSTER_CULLING_SHADER_SPEC_VERSION 1 +#define VK_HUAWEI_CLUSTER_CULLING_SHADER_SPEC_VERSION 2 #define VK_HUAWEI_CLUSTER_CULLING_SHADER_EXTENSION_NAME "VK_HUAWEI_cluster_culling_shader" typedef struct VkPhysicalDeviceClusterCullingShaderFeaturesHUAWEI { VkStructureType sType; @@ -16667,6 +16684,24 @@ typedef struct VkMultiviewPerViewRenderAreasRenderPassBeginInfoQCOM { +#define VK_EXT_attachment_feedback_loop_dynamic_state 1 +#define VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_SPEC_VERSION 1 +#define VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_EXTENSION_NAME "VK_EXT_attachment_feedback_loop_dynamic_state" +typedef struct VkPhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 attachmentFeedbackLoopDynamicState; +} VkPhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT; + +typedef void (VKAPI_PTR *PFN_vkCmdSetAttachmentFeedbackLoopEnableEXT)(VkCommandBuffer commandBuffer, VkImageAspectFlags aspectMask); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdSetAttachmentFeedbackLoopEnableEXT( + VkCommandBuffer commandBuffer, + VkImageAspectFlags aspectMask); +#endif + + #define VK_KHR_acceleration_structure 1 #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 #define VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME "VK_KHR_acceleration_structure" diff --git a/icd/api/include/log.h b/icd/api/include/log.h index 417fc8d5..9610cdef 100644 --- a/icd/api/include/log.h +++ b/icd/api/include/log.h @@ -63,13 +63,12 @@ static void AmdvlkLog( return; } - va_list argList; - va_start(argList, pFormatStr); - va_end(argList); - #if PAL_ENABLE_PRINTS_ASSERTS + va_list argList; Util::DbgPrintf(Util::DbgPrintCatMsgFile, Util::DbgPrintStyleNoPrefixNoCrLf, "%s-", LogTag[tagId]); + va_start(argList, pFormatStr); Util::DbgVPrintf(Util::DbgPrintCatMsgFile, Util::DbgPrintStyleNoPrefixNoCrLf, pFormatStr, argList); + va_end(argList); Util::DbgPrintf(Util::DbgPrintCatMsgFile, Util::DbgPrintStyleNoPrefixNoCrLf, "\n"); #endif } diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index 6a4a1b37..962af3d2 100644 --- a/icd/api/include/pipeline_compiler.h +++ 
b/icd/api/include/pipeline_compiler.h @@ -140,6 +140,7 @@ class PipelineCompiler VkResult BuildShaderModule( const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -169,6 +170,7 @@ class PipelineCompiler VkResult CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, ShaderModuleHandle* pModule); @@ -209,11 +211,6 @@ class PipelineCompiler PipelineMetadata* pBinaryMetadata, GraphicsPipelineBinaryCreateInfo* pCreateInfo); - static void SetPartialGraphicsPipelineBinaryInfo( - const ShaderModuleHandle* pShaderModuleHandle, - const ShaderStage stage, - GraphicsPipelineBinaryCreateInfo* pCreateInfo); - VkResult ConvertComputePipelineInfo( const Device* pDevice, const VkComputePipelineCreateInfo* pIn, diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 0340e8eb..76de0400 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -404,6 +404,7 @@ class CmdBuffer void BindIndexBuffer( VkBuffer buffer, VkDeviceSize offset, + VkDeviceSize size, VkIndexType indexType); void BindVertexBuffers( @@ -1050,7 +1051,8 @@ class CmdBuffer void PalCmdBindIndexData( Buffer* pBuffer, Pal::gpusize offset, - Pal::IndexType indexType); + Pal::IndexType indexType, + Pal::gpusize bufferSize); void PalCmdUnbindIndexData(Pal::IndexType indexType); @@ -1727,8 +1729,11 @@ class CmdBuffer GpuRt::DispatchRaysConstants* pConstants); void BindRayQueryConstants( - const Pipeline* pPipeline, - Pal::PipelineBindPoint bindPoint); + const Pipeline* pPipeline, + Pal::PipelineBindPoint bindPoint, + uint32_t width, + uint32_t height, + uint32_t depth); #endif union CmdBufferFlags diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h index 58ea36cd..8592b435 100644 --- a/icd/api/include/vk_conv.h +++ b/icd/api/include/vk_conv.h @@ -539,6 +539,7 @@ inline Pal::TexFilter VkToPalTexFilter( break; default: VK_NOT_IMPLEMENTED; + break; } const Pal::XyFilter pointFilter = (anisotropicEnabled != VK_FALSE) ? Pal::XyFilterAnisotropicPoint : @@ -555,6 +556,7 @@ inline Pal::TexFilter VkToPalTexFilter( break; default: VK_NOT_IMPLEMENTED; + break; } switch (minFilter) @@ -567,6 +569,7 @@ inline Pal::TexFilter VkToPalTexFilter( break; default: VK_NOT_IMPLEMENTED; + break; } return palTexFilter; @@ -1226,17 +1229,15 @@ inline void VkToPalSubresRange( uint32_t* pPalSubresRangeIndex, const RuntimeSettings& settings) { - constexpr uint32_t WHOLE_SIZE_UINT32 = (uint32_t)VK_WHOLE_SIZE; + // The minimums below are used for VkImageSubresourceRange VK_WHOLE_SIZE handling. Pal::SubresRange palSubresRange; palSubresRange.startSubres.arraySlice = range.baseArrayLayer; palSubresRange.startSubres.mipLevel = range.baseMipLevel; palSubresRange.numPlanes = 1; - palSubresRange.numMips = (range.levelCount == WHOLE_SIZE_UINT32) ? - (mipLevels - range.baseMipLevel) : range.levelCount; - palSubresRange.numSlices = (range.layerCount == WHOLE_SIZE_UINT32) ? 
- (arraySize - range.baseArrayLayer) : range.layerCount; + palSubresRange.numMips = Util::Min(range.levelCount, (mipLevels - range.baseMipLevel)); + palSubresRange.numSlices = Util::Min(range.layerCount, (arraySize - range.baseArrayLayer)); VkImageAspectFlags aspectMask = range.aspectMask; Pal::ChNumFormat palFormat = VkToPalFormat(format, settings).format; @@ -1251,7 +1252,8 @@ inline void VkToPalSubresRange( { palSubresRange.startSubres.plane = VkToPalImagePlaneExtract(palFormat, &aspectMask); pPalSubresRanges[(*pPalSubresRangeIndex)++] = palSubresRange; - } while (aspectMask != 0); + } + while (aspectMask != 0); } // ===================================================================================================================== @@ -1570,69 +1572,6 @@ void VkToPalImageScaledCopyRegion( while (aspectMask != 0); } -// ===================================================================================================================== -// Converts a Vulkan image-blit structure to one or more PAL color-space-conversion-region structures. -inline Pal::ColorSpaceConversionRegion VkToPalImageColorSpaceConversionRegion( - const VkImageBlit& imageBlit, - Pal::SwizzledFormat srcFormat, - Pal::SwizzledFormat dstFormat) -{ - - Pal::ColorSpaceConversionRegion region = {}; - - // Color conversion blits can only happen between a YUV and an RGB image. - VK_ASSERT((Pal::Formats::IsYuv(srcFormat.format) && (Pal::Formats::IsYuv(dstFormat.format) == false)) || - ((Pal::Formats::IsYuv(srcFormat.format) == false) && (Pal::Formats::IsYuv(dstFormat.format)))); - - const VkImageSubresourceLayers& rgbSubresource = - Pal::Formats::IsYuv(srcFormat.format) ? imageBlit.dstSubresource : imageBlit.srcSubresource; - - const VkImageSubresourceLayers& yuvSubresource = - Pal::Formats::IsYuv(srcFormat.format) ? imageBlit.srcSubresource : imageBlit.dstSubresource; - - // Convert values to temporary 3D variables as the PAL interface currently only accepts 2D - Pal::Offset3d srcOffset = VkToPalOffset3d(imageBlit.srcOffsets[0]); - Pal::SignedExtent3d srcExtent = VkToPalSignedExtent3d(imageBlit.srcOffsets); - Pal::Offset3d dstOffset = VkToPalOffset3d(imageBlit.dstOffsets[0]); - Pal::SignedExtent3d dstExtent = VkToPalSignedExtent3d(imageBlit.dstOffsets); - - region.rgbSubres.plane = 0; - region.rgbSubres.mipLevel = rgbSubresource.mipLevel; - region.rgbSubres.arraySlice = rgbSubresource.baseArrayLayer; - - VK_ASSERT(yuvSubresource.mipLevel == 0); - - region.yuvStartSlice = yuvSubresource.baseArrayLayer; - - VK_ASSERT(imageBlit.srcSubresource.layerCount == imageBlit.dstSubresource.layerCount); - VK_ASSERT(srcExtent.depth == dstExtent.depth); - - region.sliceCount = Util::Max(srcExtent.depth, imageBlit.srcSubresource.layerCount); - - // Write the 2D coordinates and ignore the 3rd dimension for now - region.srcOffset.x = srcOffset.x; - region.srcOffset.y = srcOffset.y; - - VK_ASSERT(srcOffset.z == 0); - - region.srcExtent.width = srcExtent.width; - region.srcExtent.height = srcExtent.height; - - VK_ASSERT(srcExtent.depth == 1); - - region.dstOffset.x = dstOffset.x; - region.dstOffset.y = dstOffset.y; - - VK_ASSERT(dstOffset.z == 0); - - region.dstExtent.width = dstExtent.width; - region.dstExtent.height = dstExtent.height; - - VK_ASSERT(dstExtent.depth == 1); - - return region; -} - // ===================================================================================================================== // Converts a Vulkan image-resolve structure to one or more PAL image-resolve-region structures. 
template @@ -3945,12 +3884,14 @@ struct UberFetchShaderFormatInfo // ===================================================================================================================== class UberFetchShaderFormatInfoMap : - public Util::HashMap + public Util::HashMap, 1024> { public: explicit UberFetchShaderFormatInfoMap(uint32 numBuckets, PalAllocator* const pAllocator) : - Util::HashMap(numBuckets, pAllocator), + Util::HashMap, 1024>(numBuckets, pAllocator), m_bufferFormatMask(0) { } diff --git a/icd/api/include/vk_deferred_operation.h b/icd/api/include/vk_deferred_operation.h index 3a7bd3f3..e4d37dee 100644 --- a/icd/api/include/vk_deferred_operation.h +++ b/icd/api/include/vk_deferred_operation.h @@ -57,7 +57,7 @@ typedef int32_t (*DeferredHostCallback)(Device* pDevice, struct DeferredWorkload { uint32_t nextInstance; // Next workload instance to execute - uint32_t completedInstances; // # of workload instances fully executed + uint32_t completedInstances; uint32_t totalInstances; // Actual # of workload instances (UINT_MAX if not yet known, 0 if no-op) uint32_t maxInstances; // Upper limit estimate of the # of instances (for when actual # is unavailable) void* pPayloads; // Array of payloads (per workload instance) diff --git a/icd/api/include/vk_defines.h b/icd/api/include/vk_defines.h index 2ee7d2d8..00a1182e 100644 --- a/icd/api/include/vk_defines.h +++ b/icd/api/include/vk_defines.h @@ -205,6 +205,8 @@ namespace vk PipelineCompilerTypeLlpc, // Use shader compiler provided by LLPC }; + // Point size must be set via gl_PointSize, otherwise it must be 1.0f + static const float DefaultPointSize = 1.0f; }// namespace vk #endif diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index ad2b596e..5659c842 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -156,7 +156,12 @@ class Device uint32 dynamicPrimitiveTopologyUnrestricted : 1; uint32 graphicsPipelineLibrary : 1; uint32 deviceMemoryReport : 1; - uint32 reserved : 18; + uint32 initializePointSizeInBegin : 1; + uint32 deviceAddressBindingReport : 1; + // True if EXT_DEVICE_MEMORY_REPORT or EXT_DEVICE_ADDRESS_BINDING_REPORT is enabled. 
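+            // (Annotation, not driver code: both extensions funnel into the shared
+            //  GpuMemoryEventHandler, so device creation would set this bit along the
+            //  lines of
+            //      gpuMemoryEventHandler = deviceMemoryReport | deviceAddressBindingReport;
+            //  which lets paths like InternalMemMgr test a single flag instead of two.)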
+ uint32 gpuMemoryEventHandler : 1; + uint32 assumeDynamicTopologyInLibs : 1; + uint32 reserved : 14; }; uint32 u32All; @@ -784,7 +789,7 @@ class Device const uint8_t* pCode, uint32_t numUserDataNodes, Vkgc::ResourceMappingRootNode* pUserDataNodes, - VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, bool forceWave64, const VkSpecializationInfo* pSpecializationInfo, InternalPipeline* pInternalPipeline); diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index 4690bb44..8f77bc58 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -359,6 +359,7 @@ class DeviceExtensions final : public Extensions EXT_DEPTH_RANGE_UNRESTRICTED, EXT_DESCRIPTOR_BUFFER, EXT_DESCRIPTOR_INDEXING, + EXT_DEVICE_ADDRESS_BINDING_REPORT, EXT_DEVICE_FAULT, EXT_DEVICE_MEMORY_REPORT, EXT_EXTENDED_DYNAMIC_STATE, @@ -414,6 +415,7 @@ class DeviceExtensions final : public Extensions EXT_SHADER_VIEWPORT_INDEX_LAYER, EXT_SUBGROUP_SIZE_CONTROL, EXT_TEXEL_BUFFER_ALIGNMENT, + EXT_TEXTURE_COMPRESSION_ASTC_HDR, EXT_TOOLING_INFO, EXT_TRANSFORM_FEEDBACK, EXT_VERTEX_ATTRIBUTE_DIVISOR, diff --git a/icd/api/include/vk_graphics_pipeline_library.h b/icd/api/include/vk_graphics_pipeline_library.h index 4ff4428b..d0b715a8 100644 --- a/icd/api/include/vk_graphics_pipeline_library.h +++ b/icd/api/include/vk_graphics_pipeline_library.h @@ -87,6 +87,7 @@ class GraphicsPipelineLibrary final : public GraphicsPipelineCommon, public NonD static VkResult CreatePartialPipelineBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, const GraphicsPipelineLibraryInfo* pLibInfo, const GraphicsPipelineShaderStageInfo* pShaderStageInfo, diff --git a/icd/api/include/vk_physical_device.h b/icd/api/include/vk_physical_device.h index 928faedf..855a0e52 100644 --- a/icd/api/include/vk_physical_device.h +++ b/icd/api/include/vk_physical_device.h @@ -222,12 +222,6 @@ class PhysicalDevice return index; } - bool GetQueueGroupCompatible( - uint32_t queueFamilyIndex) const - { - return m_queueFamilies[queueFamilyIndex].flags.queueGroupCompatible; - } - Pal::EngineType GetQueueFamilyPalEngineType( uint32_t queueFamilyIndex) const { @@ -862,17 +856,6 @@ class PhysicalDevice VkShaderStageFlags validShaderStages; uint32_t palImageLayoutFlag; VkQueueFamilyProperties properties; - - union - { - struct - { - uint32_t queueGroupCompatible : 1; - uint32_t reserved : 31; - }; - uint32_t u32All; - } flags; - } m_queueFamilies[Queue::MaxQueueFamilies]; // List of indices for compute engines that aren't exclusive. diff --git a/icd/api/include/vk_pipeline.h b/icd/api/include/vk_pipeline.h index 230cd427..575da33b 100644 --- a/icd/api/include/vk_pipeline.h +++ b/icd/api/include/vk_pipeline.h @@ -70,12 +70,13 @@ struct ShaderModuleHandle; // enabled. 
struct PipelineBinaryInfo { - static PipelineBinaryInfo* Create(size_t size, const void* pBinary, const VkAllocationCallbacks* pAllocator); + static PipelineBinaryInfo* Create(Util::MetroHash::Hash hash, size_t size, const void* pBinary, const VkAllocationCallbacks* pAllocator); void Destroy(const VkAllocationCallbacks* pAllocator); size_t binaryByteSize; void* pBinary; + Util::MetroHash::Hash binaryHash; }; enum class DynamicStatesInternal : uint32_t diff --git a/icd/api/include/vk_pipeline_cache.h b/icd/api/include/vk_pipeline_cache.h index fc403078..2791dbd5 100644 --- a/icd/api/include/vk_pipeline_cache.h +++ b/icd/api/include/vk_pipeline_cache.h @@ -36,13 +36,6 @@ namespace vk class Device; -// Layout for pipeline cache private header, all fields are written with LSB first. -struct PipelineCachePrivateHeaderData -{ - PipelineCompilerType cacheType; // Cache data type - uint64_t blobSize[MaxPalDevices]; // Blob data size for each device -}; - // ===================================================================================================================== // Implementation of Vulkan pipeline cache object class PipelineCache final : public NonDispatchable @@ -60,12 +53,6 @@ class PipelineCache final : public NonDispatchablem_offset); } - if ((result == VK_SUCCESS) && - (m_pDevice->GetEnabledFeatures().deviceMemoryReport == true)) + if (result == VK_SUCCESS) { - // Sub-allocation succeeded, either from an existing pool, or a new pool. Report the allocation to - // device_memory_report. - Pal::IGpuMemory* pPalGpuMem = pInternalMemory->PalMemory(DefaultDeviceIndex); - auto*const pPhysicalDevice = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex); - - uint32_t heapIndex = 0; - bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pPalGpuMem->Desc().heaps[0], &heapIndex); - VK_ASSERT(validHeap); - - m_pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanSubAllocateEvent( - pPalGpuMem, - pInternalMemory->m_offset, - pInternalMemory->m_size, - requestingObjectHandle, - requestingObjectType, - heapIndex); + const Device::DeviceFeatures& deviceFeatures = m_pDevice->GetEnabledFeatures(); + if (deviceFeatures.gpuMemoryEventHandler) + { + // Sub-allocation succeeded, either from an existing pool, or a new pool. Report the allocation to + // GpuMemoryEventHandler. 
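+                // (Annotation, not part of this change: the report wants a Vulkan heap index,
+                //  so the pool allocation's first PAL heap preference is translated through
+                //  GetVkHeapIndexFromPalHeap() before VulkanSubAllocateEvent() is raised.)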
+ Pal::IGpuMemory* pPalGpuMem = pInternalMemory->PalMemory(DefaultDeviceIndex); + auto* const pPhysicalDevice = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex); + + uint32_t heapIndex = 0; + bool validHeap = pPhysicalDevice->GetVkHeapIndexFromPalHeap(pPalGpuMem->Desc().heaps[0], &heapIndex); + VK_ASSERT(validHeap); + + m_pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanSubAllocateEvent( + m_pDevice, + pPalGpuMem, + pInternalMemory->m_offset, + pInternalMemory->m_size, + requestingObjectHandle, + requestingObjectType, + heapIndex); + } } } } @@ -798,17 +802,20 @@ void InternalMemMgr::FreeGpuMem( if (pInternalMemory->m_memoryPool.pBuddyAllocator != nullptr) { + const Device::DeviceFeatures& deviceFeatures = m_pDevice->GetEnabledFeatures(); + // The memory was suballocated so free it using the buddy allocator pInternalMemory->m_memoryPool.pBuddyAllocator->Free( pInternalMemory->m_offset, pInternalMemory->m_size, pInternalMemory->m_alignment); - if (m_pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { Pal::IGpuMemory* pPalGpuMem = pInternalMemory->PalMemory(DefaultDeviceIndex); m_pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanSubFreeEvent( + m_pDevice, pPalGpuMem, pInternalMemory->m_offset); } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 05c0cab1..15fb39fd 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -73,19 +73,21 @@ static bool IsDynamicStateEnabled(const uint64_t dynamicStateFlags, const Dynami // ===================================================================================================================== // Populates shaderLibrary input flags according to settings static uint32_t GpuRtShaderLibraryFlags( - const PhysicalDevice* pDevice) + const Device* pDevice) { - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + GpuRt::TraceRayCounterMode counterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); uint32_t flags = 0; - if ((settings.rtTraceRayCounterMode != TraceRayCounterDisable) || + if ((counterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable) || (settings.rtTraceRayProfileFlags != TraceRayProfileDisable)) { flags |= static_cast(GpuRt::ShaderLibraryFeatureFlag::Developer); } - if (settings.emulatedRtIpLevel > HardwareRtIpLevel1_1) + if ((settings.emulatedRtIpLevel > HardwareRtIpLevel1_1) + ) { flags |= static_cast(GpuRt::ShaderLibraryFeatureFlag::SoftwareTraversal); } @@ -167,9 +169,8 @@ void PipelineCompiler::GetElfCacheMetricString( size_t outStrSize) { const int64_t freq = Util::GetPerfFrequency(); - - const int64_t avgUs = pCacheMatrix->totalBinaries > 0 ? - ((pCacheMatrix->totalTimeSpent / pCacheMatrix->totalBinaries) * 1000000) / freq : + const int64_t avgUs = (pCacheMatrix->totalBinaries + pCacheMatrix->cacheHits) > 0 ? 
+ ((pCacheMatrix->totalTimeSpent / (pCacheMatrix->totalBinaries + pCacheMatrix->cacheHits)) * 1000000) / freq : 0; const double avgMs = avgUs / 1000.0; @@ -182,13 +183,13 @@ void PipelineCompiler::GetElfCacheMetricString( static constexpr char metricFmtString[] = "%s\n" - "Cache hit rate - %0.1f%%\n" - "Total request count - %d\n" + "Cache hit rate - %0.1f%% (%d/%d)\n" + "Total new binary - %d\n" "Total time spent - %0.1f ms\n" "Average time spent per request - %0.3f ms\n\n"; - Util::Snprintf(pOutStr, outStrSize, metricFmtString, - pPrefixStr, hitRate * 100, pCacheMatrix->totalBinaries, totalMs, avgMs); + Util::Snprintf(pOutStr, outStrSize, metricFmtString, pPrefixStr, hitRate * 100, pCacheMatrix->cacheHits, + pCacheMatrix->cacheAttempts, pCacheMatrix->totalBinaries, totalMs, avgMs); } // ===================================================================================================================== @@ -315,15 +316,11 @@ VkResult PipelineCompiler::Initialize() // This isn't a terminal failure, the device can continue without the pipeline cache if need be. VK_ALERT(m_pBinaryCache == nullptr); - if (m_pBinaryCache != nullptr) - { - pCacheAdapter = m_pBinaryCache->GetCacheAdapter(); - } } if (result == VK_SUCCESS) { - result = m_compilerSolutionLlpc.Initialize(m_gfxIp, info.gfxLevel, pCacheAdapter); + result = m_compilerSolutionLlpc.Initialize(m_gfxIp, info.gfxLevel, m_pBinaryCache); } if (result == VK_SUCCESS) @@ -369,7 +366,7 @@ void PipelineCompiler::Destroy() auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); Util::MutexAuto mutexLock(&m_cacheLock); - if (m_pPhysicalDevice->GetRuntimeSettings().enableEarlyCompile) + if (SupportInternalModuleCache(m_pPhysicalDevice, GetCompilerCollectionMask())) { for (auto it = m_shaderModuleHandleMap.Begin(); it.Get() != nullptr; it.Next()) { @@ -397,38 +394,6 @@ void PipelineCompiler::Destroy() m_uberFetchShaderInternalDataMap.Reset(); } -// ===================================================================================================================== -// Creates shader cache object. -VkResult PipelineCompiler::CreateShaderCache( - const void* pInitialData, - size_t initialDataSize, - uint32_t expectedEntries, - void* pShaderCacheMem, - ShaderCache* pShaderCache) -{ - VkResult result = VK_SUCCESS; - - return result; -} - -// ===================================================================================================================== -// Gets the size of shader cache object. -size_t PipelineCompiler::GetShaderCacheSize( - PipelineCompilerType cacheType) -{ - size_t shaderCacheSize = 0; - return shaderCacheSize; -} - -// ===================================================================================================================== -// Gets shader cache type. -PipelineCompilerType PipelineCompiler::GetShaderCacheType() -{ - PipelineCompilerType cacheType; - cacheType = PipelineCompilerTypeLlpc; - return cacheType; -} - // ===================================================================================================================== // Loads shader binary from replace shader folder with specified shader hash code. 
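// (Usage sketch, not new behavior: with shader replacement enabled, a developer dumps
//  pipelines, edits the SPIR-V for a given 64-bit shader hash, and drops the binary into
//  the configured replace folder; ReplacePipelineShaderModule() below then feeds the
//  loaded bytes back through BuildShaderModule() in place of the application's code.)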
bool PipelineCompiler::LoadReplaceShaderBinary( @@ -589,6 +554,7 @@ void PipelineCompiler::StoreShaderModuleToCache( if ((pBinaryCache != nullptr) || supportInternalModuleCache) { + const Util::MetroHash::Hash shaderModuleCacheHash = GetShaderModuleCacheHash(flags, compilerMask, uniqueHash); @@ -628,6 +594,7 @@ void PipelineCompiler::StoreShaderModuleToCache( VkResult PipelineCompiler::BuildShaderModule( const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, size_t codeSize, const void* pCode, const bool adaptForFastLink, @@ -670,15 +637,27 @@ VkResult PipelineCompiler::BuildShaderModule( result = LoadShaderModuleFromCache( pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pFeedback, pShaderModule); + VkShaderModuleCreateFlags internalFlags = internalShaderFlags; + if (result != VK_SUCCESS) { if (compilerMask & (1 << PipelineCompilerTypeLlpc)) { result = m_compilerSolutionLlpc.BuildShaderModule( - pDevice, flags, codeSize, pCode, adaptForFastLink, isInternal, pShaderModule, PipelineOptimizerKey{}); + pDevice, + flags, + internalFlags, + codeSize, + pCode, + adaptForFastLink, + isInternal, + pShaderModule, + PipelineOptimizerKey{}); } - StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pShaderModule); + { + StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pShaderModule); + } } else { @@ -744,12 +723,21 @@ void PipelineCompiler::FreeShaderModule( { m_compilerSolutionLlpc.FreeShaderModule(pShaderModule); auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + if (pShaderModule->elfPackage.codeSize > 0) + { + pInstance->FreeMem(const_cast(pShaderModule->elfPackage.pCode)); + } pInstance->FreeMem(pShaderModule->pRefCount); } } else { m_compilerSolutionLlpc.FreeShaderModule(pShaderModule); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + if (pShaderModule->elfPackage.codeSize > 0) + { + pInstance->FreeMem(const_cast(pShaderModule->elfPackage.pCode)); + } } } @@ -815,7 +803,7 @@ bool PipelineCompiler::ReplacePipelineShaderModule( if (LoadReplaceShaderBinary(hash64, &codeSize, &pCode)) { VkResult result = - BuildShaderModule(pDevice, 0, codeSize, pCode, false, false, nullptr, nullptr, pShaderModule); + BuildShaderModule(pDevice, 0, 0, codeSize, pCode, false, false, nullptr, nullptr, pShaderModule); if (result == VK_SUCCESS) { @@ -1013,20 +1001,18 @@ Util::Result PipelineCompiler::GetCachedPipelineBinary( *pIsInternalCacheHit = true; } } + m_pipelineCacheMatrix.totalTimeSpent += Util::GetPerfCpuTime() - startTime; if (*pIsUserCacheHit || *pIsInternalCacheHit) { *pFreeCompilerBinary = FreeWithInstanceAllocator; cacheResult = Util::Result::Success; m_pipelineCacheMatrix.cacheHits++; + DumpCacheMatrix(m_pPhysicalDevice, + "Pipeline_runtime", + m_pipelineCacheMatrix.totalBinaries + m_pipelineCacheMatrix.cacheHits, + &m_pipelineCacheMatrix); } - m_pipelineCacheMatrix.totalTimeSpent += Util::GetPerfCpuTime() - startTime; - - DumpCacheMatrix(m_pPhysicalDevice, - "Pipeline_runtime", - m_pipelineCacheMatrix.totalBinaries + m_pipelineCacheMatrix.cacheHits, - &m_pipelineCacheMatrix); - return cacheResult; } @@ -1145,11 +1131,11 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; Vkgc::PipelineBuildInfo pipelineInfo = {}; - pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; - uint64_t dumpHash = pipelineHash; - pPipelineDumpHandle = 
Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, - pipelineInfo, - dumpHash); + pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : pipelineHash; + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, + pipelineInfo, + dumpHash); } if (shouldCompile && (result == VK_SUCCESS)) @@ -1222,18 +1208,22 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( // Create ISA/relocatable shader for a specific shader based on pipeline information VkResult PipelineCompiler::CreateGraphicsShaderBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const ShaderStage stage, GraphicsPipelineBinaryCreateInfo* pCreateInfo, ShaderModuleHandle* pModule) { VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); const uint32_t compilerMask = GetCompilerCollectionMask(); - void* pPipelineDumpHandle = nullptr; + uint64_t libraryHash = Vkgc::IPipelineDumper::GetGraphicsShaderBinaryHash(&pCreateInfo->pipelineInfo, stage); + pCreateInfo->libraryHash[stage] = libraryHash; + void* pPipelineDumpHandle = nullptr; if (settings.enablePipelineDump) { - uint64_t dumpHash = Vkgc::IPipelineDumper::GetGraphicsShaderBinaryHash(&pCreateInfo->pipelineInfo, stage); + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : libraryHash; Vkgc::PipelineDumpOptions dumpOptions = {}; dumpOptions.pDumpDir = settings.pipelineDumpDir; @@ -1252,6 +1242,7 @@ VkResult PipelineCompiler::CreateGraphicsShaderBinary( { result = m_compilerSolutionLlpc.CreateGraphicsShaderBinary( pDevice, + pPipelineCache, stage, pCreateInfo, pPipelineDumpHandle, @@ -1323,8 +1314,16 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( } } } + if (shouldCompile) + { + if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && + (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) + { + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } - if (settings.enablePipelineDump) + if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; dumpOptions.pDumpDir = settings.pipelineDumpDir; @@ -1334,37 +1333,30 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( Vkgc::PipelineBuildInfo pipelineInfo = {}; pipelineInfo.pComputeInfo = &pCreateInfo->pipelineInfo; - pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, pipelineHash); + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? 
pCreateInfo->apiPsoHash : pipelineHash; + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); } - if (shouldCompile) + if (shouldCompile && (result == VK_SUCCESS)) { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + result = m_compilerSolutionLlpc.CreateComputePipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + pCreateInfo, + pPipelineBinarySize, + ppPipelineBinary, + pPipelineDumpHandle, + pipelineHash, + pCacheId, + &compileTime); } - else - { - if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) - { - result = m_compilerSolutionLlpc.CreateComputePipelineBinary( - pDevice, - deviceIdx, - pPipelineCache, - pCreateInfo, - pPipelineBinarySize, - ppPipelineBinary, - pPipelineDumpHandle, - pipelineHash, - pCacheId, - &compileTime); - } - if (result == VK_SUCCESS) - { - pCreateInfo->freeCompilerBinary = FreeWithCompiler; - } + if (result == VK_SUCCESS) + { + pCreateInfo->freeCompilerBinary = FreeWithCompiler; } } @@ -1610,7 +1602,10 @@ static void CopyPipelineShadersInfo( if ((shaderMask & (1 << stage)) != 0) { *pShaderInfosDst[stage] = *pShaderInfosSrc[stage]; + pCreateInfo->earlyElfPackage[stage] = libInfo.earlyElfPackage[stage]; + pCreateInfo->earlyElfPackageHash[stage] = libInfo.earlyElfPackageHash[stage]; + pCreateInfo->libraryHash[stage] = libInfo.libraryHash[stage]; } } } @@ -2037,6 +2032,7 @@ static VkResult BuildPipelineResourceMapping( if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); size_t genericMappingBufferSize = pLayout->GetPipelineInfo()->mappingBufferSize; @@ -2217,7 +2213,7 @@ static void BuildColorBlendState( VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - if (pCb != nullptr) + if ((pCb != nullptr) && (i < pCb->attachmentCount)) { const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorWriteMask) == false) @@ -2289,7 +2285,14 @@ static void BuildColorBlendState( } } - if (pCb != nullptr) + if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEquation) || + IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::ColorBlendEnable)) + { + { + dualSourceBlendEnabled = true; + } + } + else if (pCb != nullptr) { dualSourceBlendEnabled = GraphicsPipelineCommon::GetDualSourceBlendEnableState(pDevice, pCb); } @@ -2361,8 +2364,9 @@ static void BuildPreRasterizationShaderState( const RenderPass* pRenderPass = RenderPass::ObjectFromHandle(pIn->renderPass); bool isConservativeOverestimation = false; bool unrestrictedPrimitiveTopology = - IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::PrimitiveTopology) && - pDevice->GetEnabledFeatures().dynamicPrimitiveTopologyUnrestricted; + pDevice->GetEnabledFeatures().assumeDynamicTopologyInLibs || + (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::PrimitiveTopology) && + pDevice->GetEnabledFeatures().dynamicPrimitiveTopologyUnrestricted); BuildRasterizationState(pIn->pRasterizationState, dynamicStateFlags, &isConservativeOverestimation, pCreateInfo); @@ -2548,7 +2552,7 @@ static void BuildExecutablePipelineState( { pDefaultCompiler->SetRayTracingState(pDevice, &(pCreateInfo->pipelineInfo.rtState), 0); - 
uint32_t flags = GpuRtShaderLibraryFlags(pDevice->VkPhysicalDevice(DefaultDeviceIndex)); + uint32_t flags = GpuRtShaderLibraryFlags(pDevice); const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(flags); @@ -2692,13 +2696,16 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( pCreateInfo->pipelineInfo.enableUberFetchShader = false; } + if (libInfo.flags.isLibrary) + { + pCreateInfo->pipelineInfo.unlinked = true; + } if (libInfo.flags.isLibrary) { auto pPipelineBuildInfo = &pCreateInfo->pipelineInfo; pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); pPipelineBuildInfo->pInstance = pInstance; - pPipelineBuildInfo->unlinked = true; CompilerSolution::DisableNggCulling(&pPipelineBuildInfo->nggState); } @@ -2715,16 +2722,6 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( return result; } -// ===================================================================================================================== -// Fill partial pipeline binary info in GraphicsPipelineBinaryCreateInfo -void PipelineCompiler::SetPartialGraphicsPipelineBinaryInfo( - const ShaderModuleHandle* pShaderModuleHandle, - const ShaderStage stage, - GraphicsPipelineBinaryCreateInfo* pCreateInfo) -{ - pCreateInfo->earlyElfPackage[stage] = pShaderModuleHandle->elfPackage; -} - // ===================================================================================================================== // Checks which compiler is used template @@ -2814,6 +2811,8 @@ void PipelineCompiler::ApplyPipelineOptions( pOptions->threadGroupSwizzleMode = static_cast(settings.forceCsThreadGroupSwizzleMode); + pOptions->enableImplicitInvariantExports = (settings.disableImplicitInvariantExports == false); + pOptions->reverseThreadGroup = settings.enableAlternatingThreadGroupOrder; if (pDevice->GetEnabledFeatures().robustBufferAccessExtended) @@ -2888,6 +2887,8 @@ VkResult PipelineCompiler::ConvertComputePipelineInfo( } else { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); + pCreateInfo->pMappingBuffer = Util::VoidPtrInc(pCreateInfo->pTempBuffer, genericMappingBufferSize); // NOTE: Zero the allocated space that is used to create pipeline resource mappings. 
Some @@ -2932,7 +2933,7 @@ VkResult PipelineCompiler::ConvertComputePipelineInfo( if (pDevice->RayTrace() != nullptr) { - uint32_t flags = GpuRtShaderLibraryFlags(pDevice->VkPhysicalDevice(DefaultDeviceIndex)); + uint32_t flags = GpuRtShaderLibraryFlags(pDevice); const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(flags); VK_ASSERT(codePatch.dxilSize > 0); @@ -3251,6 +3252,8 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); + pCreateInfo->pMappingBuffer = pCreateInfo->pTempBuffer; tempBufferOffset += pCreateInfo->mappingBufferSize; @@ -3350,7 +3353,7 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( pCreateInfo->allowShaderInlining = false; } - uint32_t flags = GpuRtShaderLibraryFlags(pDevice->VkPhysicalDevice(DefaultDeviceIndex)); + uint32_t flags = GpuRtShaderLibraryFlags(pDevice); const GpuRt::PipelineShaderCode codePatch = GpuRt::GetShaderLibraryCode(flags); VK_ASSERT(codePatch.dxilSize > 0); @@ -3434,7 +3437,17 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( bool shaderModuleReplaced = false; - if (settings.enablePipelineDump) + if (shouldCompile) + { + if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && + (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) + { + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + + } + + if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; @@ -3446,8 +3459,9 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( Vkgc::PipelineBuildInfo pipelineInfo = {}; pipelineInfo.pRayTracingInfo = &pCreateInfo->pipelineInfo; + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? 
pCreateInfo->apiPsoHash : pipelineHash; pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump( - &dumpOptions, pipelineInfo, pipelineHash); + &dumpOptions, pipelineInfo, dumpHash); } uint32_t shaderCount = pCreateInfo->pipelineInfo.shaderCount; @@ -3498,44 +3512,36 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( } } - if (shouldCompile) + if (shouldCompile && (result == VK_SUCCESS)) { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - else + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { - if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) - { - int64_t startTime = Util::GetPerfCpuTime(); + int64_t startTime = Util::GetPerfCpuTime(); - // Build the LLPC pipeline - Llpc::RayTracingPipelineBuildOut pipelineOut = {}; + // Build the LLPC pipeline + Llpc::RayTracingPipelineBuildOut pipelineOut = {}; - // Fill pipeline create info for LLPC - pPipelineBuildInfo->pInstance = pInstance; - pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; + // Fill pipeline create info for LLPC + pPipelineBuildInfo->pInstance = pInstance; + pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; - result = m_compilerSolutionLlpc.CreateRayTracingPipelineBinary( - pDevice, - deviceIdx, - pPipelineCache, - pCreateInfo, - pPipelineBinary, - pPipelineDumpHandle, - pipelineHash, - pCacheId, - &compileTime); - - compileTime = Util::GetPerfCpuTime() - startTime; - } + result = m_compilerSolutionLlpc.CreateRayTracingPipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + pCreateInfo, + pPipelineBinary, + pPipelineDumpHandle, + pipelineHash, + pCacheId, + &compileTime); - if (result == VK_SUCCESS) - { - pCreateInfo->freeCompilerBinary = FreeWithCompiler; - } + compileTime = Util::GetPerfCpuTime() - startTime; + } + + if (result == VK_SUCCESS) + { + pCreateInfo->freeCompilerBinary = FreeWithCompiler; } } @@ -3705,7 +3711,8 @@ void PipelineCompiler::SetRayTracingState( &pRtState->staticPipelineFlags, &pRtState->triCompressMode, &pRtState->counterMode, - pRtState->pipelineFlags); + pRtState->pipelineFlags + ); // Set the indirect function calling convention and callee saved registers per shader type from settings pRtState->exportConfig.indirectCallingConvention = settings.indirectCallConvention; @@ -3760,6 +3767,10 @@ void PipelineCompiler::SetRayTracingState( pRtState->ldsSizePerThreadGroup = deviceProp.gfxipProperties.shaderCore.ldsSizePerThreadGroup; pRtState->maxRayLength = settings.rtMaxRayLength; + // Enables trace ray staticId and parentId handling (necessary for ray history dumps) + auto rtCounterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); + pRtState->enableRayTracingCounters = (rtCounterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable); + #if VKI_BUILD_GFX11 // Enable hardware traversal stack on RTIP 2.0+ if (settings.emulatedRtIpLevel > EmulatedRtIpLevel1_1) diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index e7e75890..06212a1f 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -27,6 +27,7 @@ #include "raytrace/ray_tracing_device.h" #include "raytrace/ray_tracing_util.h" #include "raytrace/vk_acceleration_structure.h" +#include "raytrace/vk_ray_tracing_pipeline.h" #include "include/vk_cmdbuffer.h" #include "include/vk_device.h" #include "include/vk_shader.h" @@ -98,6 
+99,24 @@ VkResult RayTracingDevice::Init() initInfo.pAccelStructTracker = GetAccelStructTracker(deviceIdx); initInfo.accelStructTrackerGpuAddr = GetAccelStructTrackerGpuVa(deviceIdx); + initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::None; + switch (m_pDevice->GetRuntimeSettings().emulatedRtIpLevel) + { + case EmulatedRtIpLevelNone: + break; + case HardwareRtIpLevel1_1: + case EmulatedRtIpLevel1_1: + initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::RtIp1_1; + break; +#if VKI_BUILD_GFX11 + case EmulatedRtIpLevel2_0: + initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::RtIp2_0; + break; +#endif + default: + break; + } + GpuRt::ClientCallbacks callbacks = {}; callbacks.pfnInsertRGPMarker = &RayTracingDevice::ClientInsertRGPMarker; callbacks.pfnConvertAccelStructBuildGeometry = @@ -179,6 +198,7 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->bvhCpuBuildModeFastBuild = static_cast(settings.rtBvhCpuBuildMode); pDeviceSettings->enableTriangleSplitting = settings.rtEnableTriangleSplitting; pDeviceSettings->triangleSplittingFactor = settings.rtTriangleSplittingFactor; + pDeviceSettings->enableFusedInstanceNode = settings.enableFusedInstanceNode; pDeviceSettings->rebraidFactor = settings.rebraidFactor; pDeviceSettings->rebraidLengthPercentage = settings.rebraidLengthPercentage; pDeviceSettings->maxTopDownBuildInstances = settings.maxTopDownBuildInstances; @@ -261,6 +281,16 @@ bool RayTracingDevice::AccelStructTrackerEnabled( m_pGpuRtDevice[deviceIdx]->AccelStructTraceEnabled()); } +// ===================================================================================================================== +GpuRt::TraceRayCounterMode RayTracingDevice::TraceRayCounterMode( + uint32_t deviceIdx) const +{ + // If the PAL trace path is enabled, then force RayHistoryLight + return m_pGpuRtDevice[deviceIdx]->RayHistoryTraceAvailable() ? + GpuRt::TraceRayCounterMode::TraceRayCounterRayHistoryLight : + static_cast(m_pDevice->GetRuntimeSettings().rtTraceRayCounterMode); +} + // ===================================================================================================================== GpuRt::AccelStructTracker* RayTracingDevice::GetAccelStructTracker( uint32_t deviceIdx) const @@ -513,6 +543,122 @@ uint64_t RayTracingDevice::GetAccelerationStructureUUID( return static_cast(gfxip) << 32 | vk::utils::GetBuildTimeHash(); } +// ===================================================================================================================== +void RayTracingDevice::SetDispatchInfo( + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::RtDispatchInfo* pDispatchInfo) const +{ + const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); + GpuRt::RtDispatchInfo dispatchInfo = {}; + + dispatchInfo.dimX = width; + dispatchInfo.dimY = height; + dispatchInfo.dimZ = depth; + + dispatchInfo.pipelineShaderCount = shaderCount; + dispatchInfo.stateObjectHash = apiHash; + + dispatchInfo.boxSortMode = settings.boxSortingHeuristic; +#if VKI_BUILD_GFX11 + dispatchInfo.usesNodePtrFlags = settings.rtEnableNodePointerFlags ? 
1 : 0; +#endif + + if (pipelineType == GpuRt::RtPipelineType::RayTracing) + { + dispatchInfo.raygenShaderTable.addr = static_cast(pRaygenSbt->deviceAddress); + dispatchInfo.raygenShaderTable.size = static_cast(pRaygenSbt->size); + dispatchInfo.raygenShaderTable.stride = static_cast(pRaygenSbt->stride); + + dispatchInfo.missShaderTable.addr = static_cast(pMissSbt->deviceAddress); + dispatchInfo.missShaderTable.size = static_cast(pMissSbt->size); + dispatchInfo.missShaderTable.stride = static_cast(pMissSbt->stride); + + dispatchInfo.hitGroupTable.addr = static_cast(pHitSbt->deviceAddress); + dispatchInfo.hitGroupTable.size = static_cast(pHitSbt->size); + dispatchInfo.hitGroupTable.stride = static_cast(pHitSbt->stride); + } + + (*pDispatchInfo) = dispatchInfo; +} + +// ===================================================================================================================== +void RayTracingDevice::TraceDispatch( + uint32_t deviceIdx, + Pal::ICmdBuffer* pPalCmdBuffer, + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::DispatchRaysConstants* pConstants) +{ + if (m_pGpuRtDevice[deviceIdx]->RayHistoryTraceActive()) + { + GpuRt::RtDispatchInfo dispatchInfo = {}; + SetDispatchInfo(pipelineType, + width, + height, + depth, + shaderCount, + apiHash, + pRaygenSbt, + pMissSbt, + pHitSbt, + &dispatchInfo); + + m_pGpuRtDevice[deviceIdx]->TraceRtDispatch(pPalCmdBuffer, + pipelineType, + dispatchInfo, + pConstants); + } +} + +// ===================================================================================================================== +void RayTracingDevice::TraceIndirectDispatch( + uint32_t deviceIdx, + GpuRt::RtPipelineType pipelineType, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + Pal::gpusize* pCounterMetadataVa, + GpuRt::InitExecuteIndirectConstants* pConstants) +{ + if (m_pGpuRtDevice[deviceIdx]->RayHistoryTraceActive()) + { + GpuRt::RtDispatchInfo dispatchInfo = {}; + SetDispatchInfo(pipelineType, + 0, + 0, + 0, + shaderCount, + apiHash, + pRaygenSbt, + pMissSbt, + pHitSbt, + &dispatchInfo); + + m_pGpuRtDevice[deviceIdx]->TraceIndirectRtDispatch(pipelineType, + dispatchInfo, + 1, + pCounterMetadataVa, + pConstants); + } +} + // ===================================================================================================================== // Compile one of gpurt's internal pipelines. 
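// =====================================================================================================================
// Editor's note: a standalone sketch (stand-in types, not driver code) of the gating used by the TraceDispatch and
// TraceIndirectDispatch wrappers above. GpuRT is only called while a ray history trace is active, and the indirect
// path reports zero dimensions because the real dispatch size is only known on the GPU timeline.
#include <cstdint>

struct RtDispatchInfoSketch
{
    uint32_t dimX;
    uint32_t dimY;
    uint32_t dimZ;
};

bool TraceDispatchIfActive(
    bool                  rayHistoryTraceActive, // models RayHistoryTraceActive() in the hunks above
    uint32_t              width,                 // 0 for indirect dispatches
    uint32_t              height,
    uint32_t              depth,
    RtDispatchInfoSketch* pDispatchInfo)
{
    if (rayHistoryTraceActive == false)
    {
        return false; // No trace: the dispatch runs without ray history instrumentation
    }

    pDispatchInfo->dimX = width;
    pDispatchInfo->dimY = height;
    pDispatchInfo->dimZ = depth;

    return true;
}
// =====================================================================================================================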
Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( @@ -657,7 +803,7 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( static_cast(buildInfo.code.pSpvCode), buildInfo.nodeCount, nodes, - VK_SHADER_MODULE_RAY_TRACING_INTERNAL_SHADER_BIT, + VK_INTERNAL_SHADER_FLAGS_RAY_TRACING_INTERNAL_SHADER_BIT, forceWave64, &specializationInfo, &pDevice->GetInternalRayTracingPipeline()); diff --git a/icd/api/raytrace/ray_tracing_device.h b/icd/api/raytrace/ray_tracing_device.h index 9a0c27b3..f13ce5b9 100644 --- a/icd/api/raytrace/ray_tracing_device.h +++ b/icd/api/raytrace/ray_tracing_device.h @@ -85,6 +85,33 @@ class RayTracingDevice uint32_t GetProfileRayFlags() const { return m_profileRayFlags; } uint32_t GetProfileMaxIterations() const { return m_profileMaxIterations; } + GpuRt::TraceRayCounterMode TraceRayCounterMode(uint32_t deviceIdx) const; + + void TraceDispatch( + uint32_t deviceIdx, + Pal::ICmdBuffer* pPalCmdBuffer, + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::DispatchRaysConstants* pConstants); + + void TraceIndirectDispatch( + uint32_t deviceIdx, + GpuRt::RtPipelineType pipelineType, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + Pal::gpusize* pCounterMetadataVa, + GpuRt::InitExecuteIndirectConstants* pConstants); + private: Device* m_pDevice; @@ -145,6 +172,18 @@ class RayTracingDevice const GpuRt::DeviceInitInfo& initInfo, ClientGpuMemHandle gpuMem); + void SetDispatchInfo( + GpuRt::RtPipelineType pipelineType, + uint32_t width, + uint32_t height, + uint32_t depth, + uint32_t shaderCount, + uint64_t apiHash, + const VkStridedDeviceAddressRegionKHR* pRaygenSbt, + const VkStridedDeviceAddressRegionKHR* pMissSbt, + const VkStridedDeviceAddressRegionKHR* pHitSbt, + GpuRt::RtDispatchInfo* pDispatchInfo) const; + AccelStructTrackerResources m_accelStructTrackerResources[MaxPalDevices]; }; diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp index b8f7e8d2..f1b9ce98 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp @@ -463,6 +463,7 @@ VkResult RayTracingPipeline::CreateImpl( BuildApiHash(pCreateInfo, &elfHash, &apiPsoHash); binaryCreateInfo.pDeferredWorkload = pDeferredWorkload; + binaryCreateInfo.apiPsoHash = apiPsoHash; const VkPipelineCreationFeedbackCreateInfoEXT* pPipelineCreationFeedbackCreateInfo = nullptr; pDefaultCompiler->GetPipelineCreationFeedback(static_cast(pCreateInfo->pNext), @@ -1335,7 +1336,8 @@ VkResult RayTracingPipeline::CreateImpl( if ((result == VK_SUCCESS) && m_pDevice->IsExtensionEnabled(DeviceExtensions::AMD_SHADER_INFO)) { - pBinary = PipelineBinaryInfo::Create(pipelineBinary[DefaultDeviceIndex].pPipelineBins[0].codeSize, + pBinary = PipelineBinaryInfo::Create(cacheId[DefaultDeviceIndex], + pipelineBinary[DefaultDeviceIndex].pPipelineBins[0].codeSize, pipelineBinary[DefaultDeviceIndex].pPipelineBins[0].pCode, pAllocator); } @@ -1948,22 +1950,24 @@ void RayTracingPipeline::ConvertStaticPipelineFlags( uint32_t* pStaticFlags, uint32_t* pTriangleCompressMode, uint32_t* pCounterMode, - uint32_t pipelineFlags) + uint32_t 
pipelineFlags +) { - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + GpuRt::TraceRayCounterMode counterMode = pDevice->RayTrace()->TraceRayCounterMode(DefaultDeviceIndex); uint32_t staticFlags = pDevice->RayTrace()->GpuRt(DefaultDeviceIndex)->GetStaticPipelineFlags( Util::TestAnyFlagSet(pipelineFlags, VK_PIPELINE_CREATE_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR), Util::TestAnyFlagSet(pipelineFlags, VK_PIPELINE_CREATE_RAY_TRACING_SKIP_AABBS_BIT_KHR), settings.rtUseRayQueryForTraceRays, pDevice->RayTrace()->AccelStructTrackerEnabled(DefaultDeviceIndex), - (settings.rtTraceRayCounterMode != TraceRayCounterMode::TraceRayCounterDisable)); + (counterMode != GpuRt::TraceRayCounterMode::TraceRayCounterDisable)); *pStaticFlags = staticFlags; *pTriangleCompressMode = static_cast(ConvertGpuRtTriCompressMode(settings.rtTriangleCompressionMode)); - *pCounterMode = settings.rtTraceRayCounterMode; + *pCounterMode = static_cast(counterMode); } // ===================================================================================================================== @@ -2142,7 +2146,6 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetRayTracingShaderGroupHandlesKHR( { RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pipeline); - // #raytracing: MGPU support - Return based on DefaultDeviceIndex since the result shouldn't vary between GPUs. pPipeline->GetRayTracingShaderGroupHandles(DefaultDeviceIndex, firstGroup, groupCount, dataSize, pData); return VK_SUCCESS; diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.h b/icd/api/raytrace/vk_ray_tracing_pipeline.h index 6421297c..757e7830 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.h +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.h @@ -215,7 +215,8 @@ class RayTracingPipeline final : public Pipeline, public NonDispatchableGetSize() - offset); + uint32_t indexCount = 0; + if (bufferSize == VK_WHOLE_SIZE) + { + indexCount = utils::BufferSizeToIndexCount(indexType, pBuffer->GetSize() - offset); + } + else + { + indexCount = utils::BufferSizeToIndexCount(indexType, bufferSize - offset); + } utils::IterateMask deviceGroup(m_curDeviceMask); do @@ -1685,13 +1694,32 @@ VkResult CmdBuffer::Begin( Pal::GlobalScissorParams scissorParams = { }; scissorParams.scissorRegion.extent.width = limits.maxFramebufferWidth; scissorParams.scissorRegion.extent.height = limits.maxFramebufferHeight; - utils::IterateMask deviceGroup(GetDeviceMask()); - do { - const uint32_t deviceIdx = deviceGroup.Index(); - PalCmdBuffer(deviceIdx)->CmdSetGlobalScissor(scissorParams); + utils::IterateMask deviceGroup(GetDeviceMask()); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + PalCmdBuffer(deviceIdx)->CmdSetGlobalScissor(scissorParams); + } + while (deviceGroup.IterateNext()); + } + + if (m_pDevice->GetEnabledFeatures().initializePointSizeInBegin) + { + m_allGpuState.staticTokens.pointLineRasterState = DynamicRenderStateToken; + const Pal::PointLineRasterStateParams params = { DefaultPointSize, + 0.0f, // Default line width is zero + limits.pointSizeRange[0], + limits.pointSizeRange[1] }; + + utils::IterateMask deviceGroup(GetDeviceMask()); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + PalCmdBuffer(deviceIdx)->CmdSetPointLineRasterState(params); + } + while (deviceGroup.IterateNext()); } - while (deviceGroup.IterateNext()); const uint32_t supportedVrsRates = deviceProps.gfxipProperties.supportedVrsRates; @@ -2369,7 +2397,8 @@ void CmdBuffer::RebindUserData( 
userDataLayout.uberFetchConstBufRegBase, 2, reinterpret_cast(&gpuAddress)); - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); } } @@ -2998,6 +3027,7 @@ PFN_vkCmdPushDescriptorSetWithTemplateKHR CmdBuffer::GetCmdPushDescriptorSetWith void CmdBuffer::BindIndexBuffer( VkBuffer buffer, VkDeviceSize offset, + VkDeviceSize size, VkIndexType indexType) { DbgBarrierPreCmd(DbgBarrierBindIndexVertexBuffer); @@ -3007,7 +3037,7 @@ void CmdBuffer::BindIndexBuffer( if (pBuffer != NULL) { - PalCmdBindIndexData(pBuffer, offset, palIndexType); + PalCmdBindIndexData(pBuffer, offset, palIndexType, size); } else { @@ -3117,7 +3147,8 @@ void CmdBuffer::BindVertexBuffers( PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers( firstBinding, bindingCount, &PerGpuState(deviceIdx)->vbBindings[firstBinding]); - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); m_vbWatermark = Util::Max(m_vbWatermark, firstBinding + bindingCount); @@ -3194,7 +3225,7 @@ void CmdBuffer::Draw( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif { @@ -3221,7 +3252,7 @@ void CmdBuffer::DrawIndexed( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif { @@ -3251,7 +3282,7 @@ void CmdBuffer::DrawIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); @@ -3309,7 +3340,7 @@ void CmdBuffer::DrawMeshTasks( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif PalCmdDrawMeshTasks(x, y, z); @@ -3332,7 +3363,7 @@ void CmdBuffer::DrawMeshTasksIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif PalCmdDrawMeshTasksIndirect(buffer, offset, count, stride, countBuffer, countOffset); @@ -3354,7 +3385,7 @@ void CmdBuffer::Dispatch( } #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute); + BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, x, y, z); #endif if (m_pDevice->GetRuntimeSettings().enableAlternatingThreadGroupOrder) @@ -3384,7 +3415,7 @@ void CmdBuffer::DispatchOffset( } #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute); + BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, dim_x, dim_y, dim_z); #endif PalCmdDispatchOffset(base_x, base_y, base_z, dim_x, dim_y, dim_z); @@ -4924,59 +4955,61 @@ void CmdBuffer::LoadOpClearColor( { // Get the image view from the attachment info const ImageView* const pImageView = ImageView::ObjectFromHandle(attachmentInfo.imageView); + if (pImageView != VK_NULL_HANDLE) + { 
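+                // Editor's note: with dynamic rendering, VkRenderingAttachmentInfo::imageView may legally be
+                // VK_NULL_HANDLE, so the load-op clear dereferences the view only inside this guard; the depth and
+                // stencil attachment paths below add the same null-handle check.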
+ // Get the attachment image + const Image* pImage = pImageView->GetImage(); - // Get the attachment image - const Image* pImage = pImageView->GetImage(); - - // Convert the clear color to the format of the attachment view - Pal::SwizzledFormat clearFormat = VkToPalFormat( - pImageView->GetViewFormat(), - m_pDevice->GetRuntimeSettings()); - Pal::ClearColor clearColor = VkToPalClearColor( - attachmentInfo.clearValue.color, - clearFormat); + // Convert the clear color to the format of the attachment view + Pal::SwizzledFormat clearFormat = VkToPalFormat( + pImageView->GetViewFormat(), + m_pDevice->GetRuntimeSettings()); + Pal::ClearColor clearColor = VkToPalClearColor( + attachmentInfo.clearValue.color, + clearFormat); - // Get subres range from the image view - Pal::SubresRange subresRange = {}; - pImageView->GetFrameBufferAttachmentSubresRange(&subresRange); + // Get subres range from the image view + Pal::SubresRange subresRange = {}; + pImageView->GetFrameBufferAttachmentSubresRange(&subresRange); - // Override the number of slices with layerCount from pBeginRendering - subresRange.numSlices = pRenderingInfo->layerCount; + // Override the number of slices with layerCount from pBeginRendering + subresRange.numSlices = pRenderingInfo->layerCount; - const auto clearSubresRanges = LoadOpClearSubresRanges( - pRenderingInfo->viewMask, - subresRange); + const auto clearSubresRanges = LoadOpClearSubresRanges( + pRenderingInfo->viewMask, + subresRange); - // Clear Layout - const Pal::ImageLayout clearLayout = pImage->GetBarrierPolicy().GetAspectLayout( - attachmentInfo.imageLayout, - subresRange.startSubres.plane, - GetQueueFamilyIndex(), - pImage->GetFormat()); + // Clear Layout + const Pal::ImageLayout clearLayout = pImage->GetBarrierPolicy().GetAspectLayout( + attachmentInfo.imageLayout, + subresRange.startSubres.plane, + GetQueueFamilyIndex(), + pImage->GetFormat()); - utils::IterateMask deviceGroup(GetDeviceMask()); - - do - { - const uint32_t deviceIdx = deviceGroup.Index(); + utils::IterateMask deviceGroup(GetDeviceMask()); - // Clear Box - Pal::Box clearBox = BuildClearBox( - pDeviceGroupRenderArea[deviceIdx], - *pImageView); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); - PalCmdBuffer(deviceIdx)->CmdClearColorImage( - *pImage->PalImage(deviceIdx), - clearLayout, - clearColor, - clearFormat, - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - 1, - &clearBox, - Pal::ColorClearAutoSync); + // Clear Box + Pal::Box clearBox = BuildClearBox( + pDeviceGroupRenderArea[deviceIdx], + *pImageView); + + PalCmdBuffer(deviceIdx)->CmdClearColorImage( + *pImage->PalImage(deviceIdx), + clearLayout, + clearColor, + clearFormat, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + 1, + &clearBox, + Pal::ColorClearAutoSync); + } + while (deviceGroup.IterateNext()); } - while (deviceGroup.IterateNext()); } } } @@ -5007,19 +5040,22 @@ void CmdBuffer::LoadOpClearDepthStencil( { const ImageView* const pStencilImageView = ImageView::ObjectFromHandle(pStencilAttachmentInfo->imageView); - pDepthStencilImage = pStencilImageView->GetImage(); + if (pStencilImageView != VK_NULL_HANDLE) + { + pDepthStencilImage = pStencilImageView->GetImage(); - GetImageLayout( - pStencilAttachmentInfo->imageView, - pStencilAttachmentInfo->imageLayout, - VK_IMAGE_ASPECT_STENCIL_BIT, - &subresRange, - &stencilLayout); + GetImageLayout( + pStencilAttachmentInfo->imageView, + pStencilAttachmentInfo->imageLayout, + VK_IMAGE_ASPECT_STENCIL_BIT, + &subresRange, + &stencilLayout); - if 
(pStencilAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) - { - clearSubresRanges.PushBack(subresRange); - clearStencil = pStencilAttachmentInfo->clearValue.depthStencil.stencil; + if (pStencilAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) + { + clearSubresRanges.PushBack(subresRange); + clearStencil = pStencilAttachmentInfo->clearValue.depthStencil.stencil; + } } } @@ -5028,19 +5064,22 @@ void CmdBuffer::LoadOpClearDepthStencil( { const ImageView* const pDepthImageView = ImageView::ObjectFromHandle(pDepthAttachmentInfo->imageView); - pDepthStencilImage = pDepthImageView->GetImage(); + if (pDepthImageView != VK_NULL_HANDLE) + { + pDepthStencilImage = pDepthImageView->GetImage(); - GetImageLayout( - pDepthAttachmentInfo->imageView, - pDepthAttachmentInfo->imageLayout, - VK_IMAGE_ASPECT_DEPTH_BIT, - &subresRange, - &depthLayout); + GetImageLayout( + pDepthAttachmentInfo->imageView, + pDepthAttachmentInfo->imageLayout, + VK_IMAGE_ASPECT_DEPTH_BIT, + &subresRange, + &depthLayout); - if (pDepthAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) - { - clearSubresRanges.PushBack(subresRange); - clearDepth = pDepthAttachmentInfo->clearValue.depthStencil.depth; + if (pDepthAttachmentInfo->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) + { + clearSubresRanges.PushBack(subresRange); + clearDepth = pDepthAttachmentInfo->clearValue.depthStencil.depth; + } } } else @@ -7033,7 +7072,8 @@ void CmdBuffer::ResetAccelerationStructureQueryPool( accelerationStructureQueryPool.GetSlotOffset(firstQuery), accelerationStructureQueryPool.GetSlotSize() * queryCount, 0); - } while (deviceGroup1.IterateNext()); + } + while (deviceGroup1.IterateNext()); // Wait for memory fill to complete { @@ -7831,7 +7871,8 @@ void CmdBuffer::QueryCopy( PalCmdBuffer(deviceIdx)->CmdRestoreComputeState(Pal::ComputeStatePipelineAndUserData); // Note that the application is responsible for doing a post-copy sync using a barrier. 
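// =====================================================================================================================
// Editor's note: the hunks above and below reformat the recurring multi-GPU loop so that the trailing
// "while (deviceGroup.IterateNext());" sits on its own line; behavior is unchanged. A minimal stand-in for the
// utils::IterateMask helper behind that idiom (illustrative only, not the driver's implementation):
#include <cassert>
#include <cstdint>

class IterateMaskSketch
{
public:
    // Used as: IterateMaskSketch deviceGroup(mask);
    //          do { uint32_t deviceIdx = deviceGroup.Index(); /* per-device work */ }
    //          while (deviceGroup.IterateNext());
    explicit IterateMaskSketch(uint32_t mask)
        :
        m_mask(mask),
        m_index(0)
    {
        assert(mask != 0); // the do/while idiom assumes at least one device bit is set
        SkipToNextSetBit();
    }

    uint32_t Index() const { return m_index; }

    bool IterateNext()
    {
        m_mask &= ~(1u << m_index); // consume the current device bit
        SkipToNextSetBit();
        return m_mask != 0;
    }

private:
    void SkipToNextSetBit()
    {
        while ((m_mask != 0) && ((m_mask & (1u << m_index)) == 0))
        {
            ++m_index;
        }
    }

    uint32_t m_mask;
    uint32_t m_index;
};
// =====================================================================================================================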
- } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); } // ===================================================================================================================== @@ -7899,10 +7940,6 @@ void CmdBuffer::WriteTimestamp( void CmdBuffer::SetSampleLocations( const VkSampleLocationsInfoEXT* pSampleLocationsInfo) { - VK_ASSERT((m_allGpuState.pGraphicsPipeline != nullptr) && - (m_allGpuState.pGraphicsPipeline->ContainsStaticState( - DynamicStatesInternal::SampleLocations) == false)); - uint32_t sampleLocationsPerPixel = static_cast(pSampleLocationsInfo->sampleLocationsPerPixel); if (sampleLocationsPerPixel > 0) @@ -8433,26 +8470,21 @@ void CmdBuffer::RPSyncPointLegacy( const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; - const uint32_t sampleCount = attachment.pImage->GetImageSamples(); - - if (sampleCount > 0) + if (attachment.pImage->IsSampleLocationsCompatibleDepth() && + tr.flags.isInitialLayoutTransition) { - if (attachment.pImage->IsSampleLocationsCompatibleDepth() && - tr.flags.isInitialLayoutTransition) - { - VK_ASSERT(attachment.pImage->HasDepth()); + VK_ASSERT(attachment.pImage->HasDepth()); - // Use the provided sample locations for this attachment if this is its - // initial layout transition - pQuadSamplePattern = - &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; - } - else - { - // Otherwise, use the subpass' sample locations - uint32_t subpass = m_renderPassInstance.subpass; - pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; - } + // Use the provided sample locations for this attachment if this is its + // initial layout transition + pQuadSamplePattern = + &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; + } + else + { + // Otherwise, use the subpass' sample locations + uint32_t subpass = m_renderPassInstance.subpass; + pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; } pLayoutTransition->imageInfo.pQuadSamplePattern = pQuadSamplePattern; @@ -8620,26 +8652,21 @@ void CmdBuffer::RPSyncPoint( const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; - const uint32_t sampleCount = attachment.pImage->GetImageSamples(); + if (attachment.pImage->IsSampleLocationsCompatibleDepth() && + tr.flags.isInitialLayoutTransition) + { + VK_ASSERT(attachment.pImage->HasDepth()); - if (sampleCount > 0) + // Use the provided sample locations for this attachment if this is its + // initial layout transition + pQuadSamplePattern = + &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; + } + else { - if (attachment.pImage->IsSampleLocationsCompatibleDepth() && - tr.flags.isInitialLayoutTransition) - { - VK_ASSERT(attachment.pImage->HasDepth()); - - // Use the provided sample locations for this attachment if this is its - // initial layout transition - pQuadSamplePattern = - &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; - } - else - { - // Otherwise, use the subpass' sample locations - uint32_t subpass = m_renderPassInstance.subpass; - pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; - } + // Otherwise, use the subpass' sample locations + uint32_t subpass = m_renderPassInstance.subpass; + pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; } pPalTransitions[acquireReleaseInfo.imageBarrierCount].pQuadSamplePattern = pQuadSamplePattern; @@ -10110,10 +10137,9 @@ void CmdBuffer::SetLineWidth( { 
DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); - constexpr float PointWidth = 1.0f; // gl_PointSize is arbitrary, elsewhere pointSize is 1.0 const VkPhysicalDeviceLimits& limits = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetLimits(); - const Pal::PointLineRasterStateParams params = { PointWidth, + const Pal::PointLineRasterStateParams params = { DefaultPointSize, lineWidth, limits.pointSizeRange[0], limits.pointSizeRange[1] }; @@ -10310,13 +10336,17 @@ void CmdBuffer::SetVertexInput( pUberFetchShaderInternalData = Util::VoidPtrInc(pUberFetchShaderInternalData, uberFetchShaderInternalDataSize); // Update vertex buffer stride - uint32 firstChanged = UINT_MAX; - uint32 lastChanged = 0; + uint32_t firstChanged = UINT_MAX; + uint32_t lastChanged = 0; + uint32_t vertexBufferCount = 0; Pal::BufferViewInfo* pVbBindings = PerGpuState(deviceIdx)->vbBindings; - for (uint32 bindex = 0; bindex < vertexBindingDescriptionCount; ++bindex) + for (uint32_t bindex = 0; bindex < vertexBindingDescriptionCount; ++bindex) { - uint32 byteStride = pVertexBindingDescriptions[bindex].stride; - uint32 binding = pVertexBindingDescriptions[bindex].binding; + uint32_t byteStride = pVertexBindingDescriptions[bindex].stride; + uint32_t binding = pVertexBindingDescriptions[bindex].binding; + + vertexBufferCount = Util::Max(binding + 1, vertexBufferCount); + Pal::BufferViewInfo* pBinding = &pVbBindings[binding]; if (pBinding->stride != byteStride) @@ -10343,7 +10373,14 @@ void CmdBuffer::SetVertexInput( &PerGpuState(deviceIdx)->vbBindings[firstChanged]); } - } while (deviceGroup.IterateNext()); + if (vertexBufferCount != pBindState->dynamicBindInfo.gfx.dynamicState.vertexBufferCount) + { + pBindState->dynamicBindInfo.gfx.dynamicState.vertexBufferCount = vertexBufferCount; + m_allGpuState.dirtyGraphics.pipeline = 1; + } + + } + while (deviceGroup.IterateNext()); } } @@ -10623,7 +10660,7 @@ void CmdBuffer::DrawIndirectByteCount( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0); #endif utils::IterateMask deviceGroup(m_curDeviceMask); @@ -11142,7 +11179,22 @@ void CmdBuffer::GetRayTracingDispatchArgs( static_assert(uint32_t(GpuRt::TraceRayCounterDispatch) == uint32_t(TraceRayCounterDispatch), "Wrong enum value, TraceRayCounterDispatch != GpuRt::TraceRayCounterDispatch"); - pConstants->constData.counterMode = static_cast(settings.rtTraceRayCounterMode); + if (width > 0) + { + // Populate internalUavBufferSrd only for direct dispatches (where width, height, and depth are known) + m_pDevice->RayTrace()->TraceDispatch(deviceIdx, + PalCmdBuffer(deviceIdx), + GpuRt::RtPipelineType::RayTracing, + width, + height, + depth, + pPipeline->GetShaderGroupCount() + 1, + pPipeline->GetApiHash(), + &raygenSbt, + &missSbt, + &hitSbt, + pConstants); + } } @@ -11327,6 +11379,16 @@ void CmdBuffer::TraceRaysIndirectPerDevice( initUserData.outputBufferVa = pScratchMemory->GpuVirtAddr(deviceIdx); initUserData.outputConstantsVa = constants.descriptorTable.dispatchRaysConstGpuVa; + m_pDevice->RayTrace()->TraceIndirectDispatch(deviceIdx, + GpuRt::RtPipelineType::RayTracing, + pPipeline->GetShaderGroupCount() + 1, + pPipeline->GetApiHash(), + &raygenShaderBindingTable, + &missShaderBindingTable, + &hitShaderBindingTable, + &initUserData.outputCounterMetaVa, + pInitConstants); + 
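// =====================================================================================================================
// Editor's note: dispatch dimensions are now threaded through BindRayQueryConstants and GetRayTracingDispatchArgs so
// ray history can record them. The convention, sketched with a hypothetical type: direct compute dispatches pass
// their thread-group counts, while draws and indirect dispatches pass zeros because their dimensions are unknown
// (or not meaningful) on the CPU timeline; this matches the "if (width > 0)" gate above.
#include <cstdint>

struct DispatchDimsSketch
{
    uint32_t x;
    uint32_t y;
    uint32_t z;
};

// A zero width marks a draw or an indirect dispatch, so ray history skips the direct-dispatch bookkeeping.
inline bool IsDirectDispatch(const DispatchDimsSketch& dims)
{
    return dims.x > 0;
}
// =====================================================================================================================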
m_pDevice->RayTrace()->GpuRt(deviceIdx)->InitExecuteIndirect(PalCmdBuffer(deviceIdx), initUserData, 1, 1); // Wait for the argument buffer to be populated before continuing with TraceRaysIndirect @@ -11474,7 +11536,10 @@ void CmdBuffer::SetRayTracingPipelineStackSize( // Setup internal constants and descriptors required for shaders using RayQuery void CmdBuffer::BindRayQueryConstants( const Pipeline* pPipeline, - Pal::PipelineBindPoint bindPoint) + Pal::PipelineBindPoint bindPoint, + uint32_t width, + uint32_t height, + uint32_t depth) { if ((pPipeline != nullptr) && pPipeline->HasRayTracing()) { @@ -11494,6 +11559,23 @@ void CmdBuffer::BindRayQueryConstants( VkDevice()->RayTrace()->GetAccelStructTrackerSrd(deviceIdx), sizeof(constants.descriptorTable.accelStructTrackerSrd)); + if (bindPoint == Pal::PipelineBindPoint::Compute) + { + // Ray history dumps for Graphics pipelines are not yet supported + m_pDevice->RayTrace()->TraceDispatch(deviceIdx, + PalCmdBuffer(deviceIdx), + GpuRt::RtPipelineType::Compute, + width, + height, + depth, + 1, + pPipeline->GetApiHash(), + nullptr, + nullptr, + nullptr, + &constants); + } + Pal::ICmdBuffer* pPalCmdBuffer = PalCmdBuffer(deviceIdx); Pal::gpusize constGpuAddr = 0; @@ -11622,7 +11704,8 @@ void CmdBuffer::BindDescriptorBufferEmbeddedSamplers( PerGpuState(deviceIdx)->setBindingData[apiBindPoint][setLayoutInfo.setPtrRegOffset] = static_cast(gpuAddr); - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); SetUserDataPipelineLayout(set, 1, pLayout, palBindPoint, apiBindPoint); } @@ -11642,6 +11725,67 @@ void CmdBuffer::ValidateGraphicsStates() { const uint32_t deviceIdx = deviceGroup.Index(); + if (m_allGpuState.dirtyGraphics.colorBlend) + { + DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); + + RenderStateCache* pRSCache = m_pDevice->GetRenderStateCache(); + + if (pColorBlend == nullptr) + { + DynamicColorBlend colorBlend = {}; + + pRSCache->CreateColorBlendState(m_allGpuState.colorBlendCreateInfo, + m_pDevice->VkInstance()->GetAllocCallbacks(), + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, + colorBlend.pPalColorBlend); + + // Check if pPalColorBlend is already in the m_palColorBlendState; destroy it and use the old one + // if so. The destroy is not expensive since it's just a refCount--. 
+ for (uint32_t i = 0; i < m_palColorBlendState.NumElements(); ++i) + { + const DynamicColorBlend& palColorBlendState = m_palColorBlendState.At(i); + + // Check device0 only should be sufficient + if (palColorBlendState.pPalColorBlend[0] == colorBlend.pPalColorBlend[0]) + { + pRSCache->DestroyColorBlendState(colorBlend.pPalColorBlend, + m_pDevice->VkInstance()->GetAllocCallbacks()); + + pColorBlend = &palColorBlendState; + break; + } + } + + // Add it to the m_palColorBlendState if it doesn't exist + if (pColorBlend == nullptr) + { + m_palColorBlendState.PushBack(colorBlend); + pColorBlend = &m_palColorBlendState.Back(); + } + } + + VK_ASSERT(pColorBlend != nullptr); + + PalCmdBindColorBlendState( + m_pPalCmdBuffers[deviceIdx], + deviceIdx, + pColorBlend->pPalColorBlend[deviceIdx]); + + bool dualSourceBlendEnable = m_pDevice->PalDevice(DefaultDeviceIndex)->CanEnableDualSourceBlend( + m_allGpuState.colorBlendCreateInfo); + + auto pDynamicState = + &m_allGpuState.pipelineState[PipelineBindGraphics].dynamicBindInfo.gfx.dynamicState; + if (dualSourceBlendEnable != pDynamicState->dualSourceBlendEnable) + { + pDynamicState->dualSourceBlendEnable = dualSourceBlendEnable; + m_allGpuState.dirtyGraphics.pipeline = 1; + } + + DbgBarrierPostCmd(DbgBarrierSetDynamicPipelineState); + } + if (m_allGpuState.dirtyGraphics.pipeline) { const GraphicsPipeline* pGraphicsPipeline = m_allGpuState.pGraphicsPipeline; @@ -11795,56 +11939,6 @@ void CmdBuffer::ValidateGraphicsStates() *Device::GetDefaultQuadSamplePattern(m_allGpuState.samplePattern.sampleCount)); } - if (m_allGpuState.dirtyGraphics.colorBlend) - { - DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); - - RenderStateCache* pRSCache = m_pDevice->GetRenderStateCache(); - - if (pColorBlend == nullptr) - { - DynamicColorBlend colorBlend = {}; - - pRSCache->CreateColorBlendState(m_allGpuState.colorBlendCreateInfo, - m_pDevice->VkInstance()->GetAllocCallbacks(), - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT, - colorBlend.pPalColorBlend); - - // Check if pPalColorBlend is already in the m_palColorBlendState, destroy it and use the old one - // if yes.The destroy is not expensive since it's just a refCount--. 
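// =====================================================================================================================
// Editor's note: the removed block below is the old home of this color-blend binding; it moves ahead of the
// pipeline-dirty check above so that dual-source blending, now answered by PAL's CanEnableDualSourceBlend() on the
// complete blend create info rather than by scanning individual blend factors (see the IsDualSourceBlend helper
// removed further below), can re-dirty the pipeline before it is validated.
// =====================================================================================================================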
- for (uint32_t i = 0; i < m_palColorBlendState.NumElements(); ++i) - { - const DynamicColorBlend& palColorBlendState = m_palColorBlendState.At(i); - - // Check device0 only should be sufficient - if (palColorBlendState.pPalColorBlend[0] == colorBlend.pPalColorBlend[0]) - { - pRSCache->DestroyColorBlendState(colorBlend.pPalColorBlend, - m_pDevice->VkInstance()->GetAllocCallbacks()); - - pColorBlend = &palColorBlendState; - break; - } - } - - // Add it to the m_palColorBlendState if it doesn't exist - if (pColorBlend == nullptr) - { - m_palColorBlendState.PushBack(colorBlend); - pColorBlend = &m_palColorBlendState.Back(); - } - } - - VK_ASSERT(pColorBlend != nullptr); - - PalCmdBindColorBlendState( - m_pPalCmdBuffers[deviceIdx], - deviceIdx, - pColorBlend->pPalColorBlend[deviceIdx]); - - DbgBarrierPostCmd(DbgBarrierSetDynamicPipelineState); - } - if (m_allGpuState.dirtyGraphics.msaa) { DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); @@ -11916,7 +12010,7 @@ void CmdBuffer::ValidateSamplePattern( { const Pal::MsaaQuadSamplePattern* pLocations; - if (pSamplePattern != nullptr && (pSamplePattern->sampleCount > 0)) + if ((pSamplePattern != nullptr) && (pSamplePattern->sampleCount > 0)) { VK_ASSERT(sampleCount == pSamplePattern->sampleCount); @@ -12202,28 +12296,6 @@ void CmdBuffer::SetColorBlendEnable( } } -// ===================================================================================================================== -// Returns true if the given VkBlendFactor factor is a dual source blend factor -static bool IsDualSourceBlend( - VkBlendFactor blend) -{ - bool needDualSource = false; - switch (blend) - { - case VK_BLEND_FACTOR_SRC1_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: - case VK_BLEND_FACTOR_SRC1_ALPHA: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: - needDualSource = true; - break; - default: - needDualSource = false; - break; - } - - return needDualSource; -} - // ===================================================================================================================== void CmdBuffer::SetColorBlendEquation( uint32_t firstAttachment, @@ -12232,8 +12304,6 @@ void CmdBuffer::SetColorBlendEquation( { uint32_t lastAttachment = Util::Min(firstAttachment + attachmentCount, Pal::MaxColorTargets); - bool dualSourceBlendEnable = false; - for (uint32_t i = firstAttachment; i < lastAttachment; i++) { const VkColorBlendEquationEXT& colorBlendEquation = pColorBlendEquations[i - firstAttachment]; @@ -12252,7 +12322,6 @@ void CmdBuffer::SetColorBlendEquation( (pTarget->srcBlendAlpha != srcBlendAlpha) || (pTarget->dstBlendAlpha != dstBlendAlpha) || (pTarget->blendFuncAlpha != blendFuncAlpha)) - { pTarget->srcBlendColor = srcBlendColor; pTarget->dstBlendColor = dstBlendColor; @@ -12261,22 +12330,6 @@ void CmdBuffer::SetColorBlendEquation( pTarget->dstBlendAlpha = dstBlendAlpha; pTarget->blendFuncAlpha = blendFuncAlpha; m_allGpuState.dirtyGraphics.colorBlend = 1; - - // Dual source blend only support color attachment 0 - if (i == 0) - { - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.srcColorBlendFactor); - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.dstColorBlendFactor); - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.srcAlphaBlendFactor); - dualSourceBlendEnable |= IsDualSourceBlend(colorBlendEquation.dstAlphaBlendFactor); - auto pDynamicState = - &m_allGpuState.pipelineState[PipelineBindGraphics].dynamicBindInfo.gfx.dynamicState; - if (dualSourceBlendEnable != pDynamicState->dualSourceBlendEnable) - { - 
pDynamicState->dualSourceBlendEnable = dualSourceBlendEnable; - m_allGpuState.dirtyGraphics.pipeline = 1; - } - } } } } @@ -12585,7 +12638,8 @@ void CmdBuffer::SetDepthClipNegativeOneToOne( do { PerGpuState(deviceGroup.Index())->viewport.depthRange = depthRange; - } while (deviceGroup.IterateNext()); + } + while (deviceGroup.IterateNext()); m_allGpuState.dirtyGraphics.viewport = 1; m_allGpuState.staticTokens.viewports = DynamicRenderStateToken; diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index 2d7f4b04..5419d85a 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -185,6 +185,8 @@ VkResult ComputePipeline::Create( uint64_t apiPsoHash = {}; BuildApiHash(pCreateInfo, shaderInfo, &elfHash, &apiPsoHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; + const VkPipelineCreationFeedbackCreateInfoEXT* pPipelineCreationFeedbackCreateInfo = nullptr; pDefaultCompiler->GetPipelineCreationFeedback(static_cast(pCreateInfo->pNext), &pPipelineCreationFeedbackCreateInfo); @@ -408,6 +410,7 @@ VkResult ComputePipeline::Create( (result == VK_SUCCESS)) { pBinary = PipelineBinaryInfo::Create( + cacheId[DefaultDeviceIndex], pipelineBinarySizes[DefaultDeviceIndex], pPipelineBinaries[DefaultDeviceIndex], pAllocator); @@ -490,6 +493,8 @@ VkResult ComputePipeline::Create( if (result == VK_SUCCESS) { + const Device::DeviceFeatures& deviceFeatures = pDevice->GetEnabledFeatures(); + uint64_t durationTicks = Util::GetPerfCpuTime() - startTimeTicks; uint64_t duration = vk::utils::TicksToNano(durationTicks); @@ -503,7 +508,7 @@ VkResult ComputePipeline::Create( &binaryCreateInfo.pipelineFeedback, &binaryCreateInfo.stageFeedback); - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { size_t numEntries = 0; Util::Vector palSubAllocInfos(pDevice->VkInstance()->Allocator()); @@ -519,6 +524,7 @@ VkResult ComputePipeline::Create( { // Report the Pal suballocation for this pipeline to device_memory_report pDevice->VkInstance()->GetGpuMemoryEventHandler()->ReportDeferredPalSubAlloc( + pDevice, palSubAllocInfos[i].address, palSubAllocInfos[i].offset, ComputePipeline::IntValueFromHandle(*pPipeline), diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index e510b4ff..a6aac8a0 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -146,6 +146,10 @@ #define PalFmt_ASTC(w, h, numfmt) \ PalFmt(Pal::ChNumFormat::AstcLdr##w##x##h##_##numfmt, PalFmtX, PalFmtY, PalFmtZ, PalFmtW) +// For VK_FORMAT_ASTC_{w}x{h}_SFLOAT_BLOCK_EXT: +#define PalFmt_ASTC_HDR(w, h, numfmt) \ + PalFmt(Pal::ChNumFormat::AstcHdr##w##x##h##_##numfmt, PalFmtX, PalFmtY, PalFmtZ, PalFmtW) + // For VK_FORMAT_B{b}G{g}R{r}A{a}_{numfmt}_PACKn: #define PalFmt_BGRA_PACK(b, g, r, a, numfmt) \ PalFmt(Pal::ChNumFormat::X##a##Y##r##Z##g##W##b##_##numfmt, PalFmtY, PalFmtZ, PalFmtW, PalFmtX) @@ -322,6 +326,20 @@ VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x10_UNORM_BLOCK, PalFmt_ASTC(12 VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x10_SRGB_BLOCK, PalFmt_ASTC(12, 10, Srgb)) VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x12_UNORM_BLOCK, PalFmt_ASTC(12, 12, Unorm)) VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x12_SRGB_BLOCK, PalFmt_ASTC(12, 12, Srgb)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_4x4_SFLOAT_BLOCK, PalFmt_ASTC_HDR(4, 4, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_5x4_SFLOAT_BLOCK, PalFmt_ASTC_HDR(5, 4, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_5x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(5, 5, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_6x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(6, 5, Float)) +VK_TO_PAL_STRUC_X( 
FORMAT_ASTC_6x6_SFLOAT_BLOCK, PalFmt_ASTC_HDR(6, 6, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_8x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(8, 5, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_8x6_SFLOAT_BLOCK, PalFmt_ASTC_HDR(8, 6, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_8x8_SFLOAT_BLOCK, PalFmt_ASTC_HDR(8, 8, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x5_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 5, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x6_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 6, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x8_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 8, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_10x10_SFLOAT_BLOCK, PalFmt_ASTC_HDR(10, 10, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x10_SFLOAT_BLOCK, PalFmt_ASTC_HDR(12, 10, Float)) +VK_TO_PAL_STRUC_X( FORMAT_ASTC_12x12_SFLOAT_BLOCK, PalFmt_ASTC_HDR(12, 12, Float)) VK_TO_PAL_STRUC_X( FORMAT_B4G4R4A4_UNORM_PACK16, PalFmt_BGRA_PACK(4, 4, 4, 4, Unorm)) VK_TO_PAL_STRUC_X( FORMAT_B5G5R5A1_UNORM_PACK16, PalFmt_BGRA_PACK(5, 5, 5, 1, Unorm)) VK_TO_PAL_STRUC_X( FORMAT_B5G6R5_UNORM_PACK16, PalFmt_BGR_PACK(5, 6, 5, Unorm)) diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index de9df969..5671c88d 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -159,9 +159,8 @@ VkResult DescriptorPool::Init( if (m_pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead) { - m_addresses[deviceIdx].fmaskCpuAddr = static_cast(m_pHostOnlyMemory); - static_cast(Util::VoidPtrInc(m_pHostOnlyMemory, - memReqs.size * numPalDevices + memReqs.size * deviceIdx)); + m_addresses[deviceIdx].fmaskCpuAddr = static_cast(Util::VoidPtrInc(m_pHostOnlyMemory, + memReqs.size * numPalDevices + memReqs.size * deviceIdx)); } } diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index f7a6bbc8..4b589b91 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -795,18 +795,29 @@ void DescriptorUpdate::CopyDescriptorSets( VK_ASSERT(destBinding.sta.dwArrayStride > 0); VK_ASSERT(srcBinding.sta.dwArrayStride > 0); uint32_t* pSrcAddr = pSrcSet->StaticCpuAddress(deviceIdx) + srcBinding.sta.dwOffset - + params.srcArrayElement * srcBinding.sta.dwArrayStride * sizeof(uint32_t); + + params.srcArrayElement * srcBinding.sta.dwArrayStride; uint32_t* pDestAddr = pDestSet->StaticCpuAddress(deviceIdx) + destBinding.sta.dwOffset - + params.dstArrayElement * destBinding.sta.dwArrayStride * sizeof(uint32_t); + + params.dstArrayElement * destBinding.sta.dwArrayStride; - for (uint32_t j = 0; j < count; ++j) + if (srcBinding.sta.dwArrayStride == destBinding.sta.dwArrayStride) + { + // Source and destination have the same memory layout of array elements. 
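+            // (Editor's note: dwOffset and dwArrayStride are dword counts, so element pointers advance by
+            // element * stride on the uint32_t* directly; the removed code below also multiplied the element offset
+            // by sizeof(uint32_t), overshooting the source and destination addresses. Equal strides let the whole
+            // array be copied with the single memcpy that follows.)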
+ memcpy(pDestAddr, pSrcAddr, srcBinding.sta.dwArrayStride * sizeof(uint32_t) * count); + } + else { - uint32_t dstSizeInDw = destBinding.sta.dwArrayStride; - uint32_t srcSizeInDw = srcBinding.sta.dwArrayStride; - memcpy(pDestAddr + j * dstSizeInDw, pSrcAddr + j * srcSizeInDw, - Util::Min(destBinding.sta.dwArrayStride * sizeof(uint32_t), - srcBinding.sta.dwArrayStride * sizeof(uint32_t))); + const auto arrayElementSize = Util::Min( + destBinding.sta.dwArrayStride * sizeof(uint32_t), + srcBinding.sta.dwArrayStride * sizeof(uint32_t)); + + for (uint32_t j = 0; j < count; ++j) + { + memcpy( + pDestAddr + j * destBinding.sta.dwArrayStride, + pSrcAddr + j * srcBinding.sta.dwArrayStride, + arrayElementSize); + } } } else if ((srcBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) || diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 9ed730d4..9d9e0454 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -376,14 +376,6 @@ VkResult Device::Create( enabledDeviceExtensions.IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1) == false); } - if (enabledDeviceExtensions.IsExtensionEnabled(DeviceExtensions::EXT_EXTENDED_DYNAMIC_STATE3)) - { - if (pPhysicalDevice->GetRuntimeSettings().dynamicPrimitiveTopologyUnrestricted) - { - deviceFeatures.dynamicPrimitiveTopologyUnrestricted = true; - } - } - uint32_t numDevices = 1; PhysicalDevice* pPhysicalDevices[MaxPalDevices] = { pPhysicalDevice }; Pal::IDevice* pPalDevices[MaxPalDevices] = { pPhysicalDevice->PalDevice() }; @@ -641,6 +633,30 @@ VkResult Device::Create( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ADDRESS_BINDING_REPORT_FEATURES_EXT: + { + const VkPhysicalDeviceAddressBindingReportFeaturesEXT* pAddressBindingReportFeaturesEXT = + reinterpret_cast(pHeader); + + if (pAddressBindingReportFeaturesEXT->reportAddressBinding) + { + deviceFeatures.deviceAddressBindingReport = true; + deviceFeatures.gpuMemoryEventHandler = true; + + uint32 enabledCallbacks = pInstance->PalPlatform()->GetEnabledCallbackTypes(); + + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::AllocGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::FreeGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::SubAllocGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::SubFreeGpuMemory); + enabledCallbacks |= 1 << static_cast(Pal::Developer::CallbackType::BindGpuMemory); + + pInstance->PalPlatform()->SetEnabledCallbackTypes(enabledCallbacks); + } + + break; + } + default: break; } @@ -689,6 +705,15 @@ VkResult Device::Create( deviceFeatures.mustWriteImmutableSamplers = false; } + if (enabledDeviceExtensions.IsExtensionEnabled(DeviceExtensions::EXT_EXTENDED_DYNAMIC_STATE3)) + { + if (pPhysicalDevice->GetRuntimeSettings().dynamicPrimitiveTopologyUnrestricted) + { + deviceFeatures.dynamicPrimitiveTopologyUnrestricted = true; + deviceFeatures.assumeDynamicTopologyInLibs = deviceFeatures.graphicsPipelineLibrary; + } + } + if ((pPhysicalDevice->GetRuntimeSettings().strictImageSizeRequirements == StrictImageSizeOn) || ((pPhysicalDevice->GetRuntimeSettings().strictImageSizeRequirements == StrictImageSizeAppControlled) && maintenance4Enabled)) @@ -804,9 +829,8 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_DEVICE_DEVICE_MEMORY_REPORT_CREATE_INFO_EXT: { - deviceFeatures.deviceMemoryReport = true; - - pInstance->GetGpuMemoryEventHandler()->EnableGpuMemoryEvents(); + deviceFeatures.deviceMemoryReport = true; + 
deviceFeatures.gpuMemoryEventHandler = true; uint32 enabledCallbacks = pInstance->PalPlatform()->GetEnabledCallbackTypes(); @@ -878,8 +902,7 @@ VkResult Device::Create( if (vkResult == VK_SUCCESS) { pMemory = pInstance->AllocMem( - privateDataSize + apiDeviceSize - , + privateDataSize + apiDeviceSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); @@ -918,6 +941,11 @@ VkResult Device::Create( auto gpuMemoryEventHandler = pInstance->GetGpuMemoryEventHandler(); Device* pDevice = ApiDevice::ObjectFromHandle(reinterpret_cast(pDispatchableDevice)); + if (deviceFeatures.gpuMemoryEventHandler) + { + gpuMemoryEventHandler->EnableGpuMemoryEvents(pDevice); + } + for (auto iter = deviceMemoryReportCallbacks.Begin(); iter.IsValid(); iter.Next()) { iter.Get().pDevice = pDevice; @@ -997,12 +1025,6 @@ VkResult Device::Create( return vkResult; } -// ===================================================================================================================== - -// ==================================================================================================================== = - -// ===================================================================================================================== - #if VKI_RAY_TRACING // ===================================================================================================================== VkResult Device::CreateRayTraceState() @@ -1685,14 +1707,16 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) m_renderStateCache.Destroy(); - const bool deviceMemoryReportEnabled = m_enabledFeatures.deviceMemoryReport; - Util::Destructor(this); - if (deviceMemoryReportEnabled == true) + if (m_enabledFeatures.deviceMemoryReport) { VkInstance()->GetGpuMemoryEventHandler()->UnregisterDeviceMemoryReportCallbacks(this); - VkInstance()->GetGpuMemoryEventHandler()->DisableGpuMemoryEvents(); + } + + if (m_enabledFeatures.gpuMemoryEventHandler) + { + VkInstance()->GetGpuMemoryEventHandler()->DisableGpuMemoryEvents(this); } FreeApiObject(VkInstance()->GetAllocCallbacks(), ApiDevice::FromObject(this)); @@ -1744,7 +1768,7 @@ VkResult Device::CreateInternalComputePipeline( const uint8_t* pCode, uint32_t numUserDataNodes, Vkgc::ResourceMappingRootNode* pUserDataNodes, - VkShaderModuleCreateFlags flags, + VkShaderModuleCreateFlags internalShaderFlags, bool forceWave64, const VkSpecializationInfo* pSpecializationInfo, InternalPipeline* pInternalPipeline) @@ -1775,7 +1799,8 @@ VkResult Device::CreateInternalComputePipeline( // Build shader module result = pCompiler->BuildShaderModule( this, - flags, + 0, + internalShaderFlags, codeByteSize, pCode, false, @@ -1927,7 +1952,7 @@ VkResult Device::CreateInternalComputePipeline( } memcpy(pInternalPipeline->pPipeline, pPipeline, sizeof(pPipeline)); - if (GetEnabledFeatures().deviceMemoryReport == true) + if (GetEnabledFeatures().gpuMemoryEventHandler) { size_t numEntries = 0; Util::Vector palSubAllocInfos(VkInstance()->Allocator()); @@ -1943,6 +1968,7 @@ VkResult Device::CreateInternalComputePipeline( // Report the Pal suballocation for this pipeline to device_memory_report // Internal pipelines are attributed to the device VkInstance()->GetGpuMemoryEventHandler()->ReportDeferredPalSubAlloc( + this, palSubAllocInfos[i].address, palSubAllocInfos[i].offset, DispatchableDevice::IntValueFromHandle(DispatchableDevice::FromObject(this)), @@ -2203,23 +2229,21 @@ void Device::GetQueue2( uint32 queueCount = VkPhysicalDevice(DefaultDeviceIndex)->GetQueueFamilyProperties(queueFamilyIndex).queueCount; + // Queues 
with flags will be indexed separately from queues without flags + // Consider only those queues with matching flags + uint32 testIndex = 0; + for (uint32 i = 0; i < queueCount; i++) { - // Queues with flags will be indexed separately to queues without flags - // Consider only those queues with matching flags - uint32 testIndex = 0; - for (uint32 i = 0; i < queueCount; i++) - { - DispatchableQueue* pFoundQueue = m_pQueues[queueFamilyIndex][i]; + DispatchableQueue* pFoundQueue = m_pQueues[queueFamilyIndex][i]; - if ((pFoundQueue != nullptr) && ((*pFoundQueue)->GetFlags() == flags)) + if ((pFoundQueue != nullptr) && ((*pFoundQueue)->GetFlags() == flags)) + { + if (testIndex == queueIndex) { - if (testIndex == queueIndex) - { - *pQueue = reinterpret_cast(pFoundQueue); - break; - } - testIndex++; + *pQueue = reinterpret_cast(pFoundQueue); + break; } + testIndex++; } } } @@ -2237,8 +2261,6 @@ Pal::PrtFeatureFlags Device::GetPrtFeatures() const return featureFlags; } -// ===================================================================================================================== - // ===================================================================================================================== VkResult Device::WaitForFences( uint32_t fenceCount, @@ -4835,7 +4857,6 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetRayTracingCaptureReplayShaderGroupHandlesKHR // replaying and we will make use of them in vkCreateRayTracingPipelinesKHR. RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pipeline); - // #raytracing: MGPU support - Return based on DefaultDeviceIndex since the result shouldn't vary between GPUs. pPipeline->GetRayTracingShaderGroupHandles(DefaultDeviceIndex, firstGroup, groupCount, dataSize, pData); return VK_SUCCESS; diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 2400b9bc..cf59ef01 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -806,6 +806,7 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetShaderModuleIdentifierEXT ); INIT_DISPATCH_ENTRY(vkGetShaderModuleCreateInfoIdentifierEXT ); + } // ===================================================================================================================== diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 276a42ca..c4792a75 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -448,6 +448,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( (result == VK_SUCCESS)) { pBinaryInfo = PipelineBinaryInfo::Create( + pCacheIds[DefaultDeviceIndex], pPipelineBinarySizes[DefaultDeviceIndex], pPipelineBinaries[DefaultDeviceIndex], pAllocator); @@ -582,6 +583,8 @@ VkResult GraphicsPipeline::Create( Util::MetroHash::Hash elfHash = {}; BuildApiHash(pCreateInfo, &apiPsoHash, &elfHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; + // 4. 
Get pipeline layout VK_ASSERT(pCreateInfo->layout != VK_NULL_HANDLE); PipelineLayout* pPipelineLayout = PipelineLayout::ObjectFromHandle(pCreateInfo->layout); @@ -699,6 +702,8 @@ VkResult GraphicsPipeline::Create( if (result == VK_SUCCESS) { + const Device::DeviceFeatures& deviceFeatures = pDevice->GetEnabledFeatures(); + uint64_t durationTicks = Util::GetPerfCpuTime() - startTimeTicks; uint64_t duration = vk::utils::TicksToNano(durationTicks); binaryCreateInfo.pipelineFeedback.feedbackValid = true; @@ -710,7 +715,7 @@ VkResult GraphicsPipeline::Create( &binaryCreateInfo.pipelineFeedback, binaryCreateInfo.stageFeedback); - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { size_t numEntries = 0; Util::Vector palSubAllocInfos(pDevice->VkInstance()->Allocator()); @@ -726,6 +731,7 @@ VkResult GraphicsPipeline::Create( { // Report the Pal suballocation for this pipeline to device_memory_report pDevice->VkInstance()->GetGpuMemoryEventHandler()->ReportDeferredPalSubAlloc( + pDevice, palSubAllocInfos[i].address, palSubAllocInfos[i].offset, GraphicsPipeline::IntValueFromHandle(*pPipeline), @@ -1191,6 +1197,11 @@ GraphicsPipeline::GraphicsPipeline( m_info.graphicsShaderInfos.dynamicState.enable.dualSourceBlendEnable = 1; } + if (ContainsDynamicState(DynamicStatesInternal::VertexInput)) + { + m_info.graphicsShaderInfos.dynamicState.enable.vertexBufferCount = 1; + } + pPalPipelineHasher->Update(m_palPipelineHash); pPalPipelineHasher->Finalize(reinterpret_cast(&m_palPipelineHash)); } diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index e97b567d..7e7d1ded 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -308,6 +308,7 @@ static GraphicsPipelineBinaryCreateInfo* DumpGraphicsPipelineBinaryCreateInfo( // ===================================================================================================================== VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( const Device* pDevice, + PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, const GraphicsPipelineLibraryInfo* pLibInfo, const GraphicsPipelineShaderStageInfo* pShaderStageInfo, @@ -317,7 +318,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( { VkResult result = VK_SUCCESS; PipelineCompiler* pCompiler = pDevice->GetCompiler(DefaultDeviceIndex); - uint32_t dynamicStateFlags = GetDynamicStateFlags(pCreateInfo->pDynamicState, pLibInfo); + uint64_t dynamicStateFlags = GetDynamicStateFlags(pCreateInfo->pDynamicState, pLibInfo); // Pipeline info only includes the shaders that match the enabled VkGraphicsPipelineLibraryFlagBitsEXT. // Use this information to skip the compilation of unused shader modules. @@ -348,7 +349,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( { // We don't take care of the result. 
Early compile failure in some cases is expected result = pCompiler->CreateGraphicsShaderBinary( - pDevice, pShaderStageInfo->stages[i].stage, pBinaryCreateInfo, &pTempModules[i]); + pDevice, pPipelineCache, pShaderStageInfo->stages[i].stage, pBinaryCreateInfo, &pTempModules[i]); } pTempModuleStages[i].stage = pShaderStageInfo->stages[i].stage; @@ -375,7 +376,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pTempModules[TempIdx] = *pParentHandle; result = pCompiler->CreateGraphicsShaderBinary( - pDevice, ShaderStage::ShaderStageVertex, pBinaryCreateInfo, &pTempModules[TempIdx]); + pDevice, pPipelineCache, ShaderStage::ShaderStageVertex, pBinaryCreateInfo, &pTempModules[TempIdx]); pTempModuleStages[TempIdx].stage = ShaderStage::ShaderStageVertex; pTempModuleStages[TempIdx].freeBinaryOnly = true; @@ -396,8 +397,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( pTempModules[TempIdx] = *pParentHandle; - result = pCompiler->CreateGraphicsShaderBinary( - pDevice, ShaderStage::ShaderStageFragment, pBinaryCreateInfo, &pTempModules[TempIdx]); + result = pCompiler->CreateGraphicsShaderBinary(pDevice, pPipelineCache, + ShaderStage::ShaderStageFragment, pBinaryCreateInfo, &pTempModules[TempIdx]); pTempModuleStages[TempIdx].stage = ShaderStage::ShaderStageFragment; pTempModuleStages[TempIdx].freeBinaryOnly = true; @@ -407,12 +408,7 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; ++i) { - if (pCompiler->IsValidShaderModule(&pTempModules[i])) - { - PipelineCompiler::SetPartialGraphicsPipelineBinaryInfo( - &pTempModules[i], pTempModuleStages[i].stage, pBinaryCreateInfo); - } - else + if (pCompiler->IsValidShaderModule(&pTempModules[i]) == false) { pTempModuleStages[i].stage = ShaderStage::ShaderStageInvalid; } @@ -443,6 +439,8 @@ VkResult GraphicsPipelineLibrary::Create( ShaderModuleHandle tempModules[ShaderStage::ShaderStageGfxCount] = {}; TempModuleState tempModuleStates[ShaderStage::ShaderStageGfxCount] = {}; + binaryCreateInfo.pipelineInfo.iaState.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + // 1. Build shader stage infos if (result == VK_SUCCESS) { @@ -481,6 +479,7 @@ VkResult GraphicsPipelineLibrary::Create( uint64_t apiPsoHash = {}; Util::MetroHash::Hash elfHash = {}; BuildApiHash(pCreateInfo, &apiPsoHash, &elfHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; // 4. Get pipeline layout const PipelineLayout* pPipelineLayout = PipelineLayout::ObjectFromHandle(pCreateInfo->layout); @@ -510,6 +509,7 @@ VkResult GraphicsPipelineLibrary::Create( // 6. Create partial pipeline binary for fast-link result = CreatePartialPipelineBinary( pDevice, + pPipelineCache, pCreateInfo, &libInfo, &shaderStageInfo, diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 06d144f7..c8a4635d 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -1789,6 +1789,9 @@ void Image::CalculateMemoryRequirementsInternal( ~pDevice->GetMemoryTypeMaskMatching(VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD); } + // Images do not use the memory types reserved for descriptor buffers + pMemoryRequirements->memoryTypeBits &= ~pDevice->GetMemoryTypeMaskForDescriptorBuffers(); + // Add an extra memory padding. This can be enabled while capturing GFXR traces and disabled later. Capturing with // this setting enabled helps in replaying GFXR traces. When this setting is not used while capture, GFXR might // return a fatal error while replaying with different DCC threshold values.
This is caused because gfxreconstruct diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index 0cf27269..d7e17eab 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -403,6 +403,8 @@ VkResult Instance::Init( if (status == VK_SUCCESS) { + Pal::IPlatform::InstallDeveloperCb(m_pPalPlatform, &Instance::PalDeveloperCallback, this); + // Get the platform property. Vulkan doesn't use it so far. Pal::PlatformProperties platformProps; @@ -477,13 +479,6 @@ VkResult Instance::Init( } } - // Install PAL developer callback if the SQTT layer is enabled. This is required to trap internal barriers - // and dispatches performed by PAL so that they can be correctly annotated to RGP. - if (status == VK_SUCCESS) - { - Pal::IPlatform::InstallDeveloperCb(m_pPalPlatform, &Instance::PalDeveloperCallback, this); - } - if (status == VK_SUCCESS) { size_t screenSize = m_pPalPlatform->GetScreenObjectSize(); @@ -567,6 +562,10 @@ VkResult Instance::Init( InitDispatchTable(); #if DEBUG + // Optionally wait for a debugger to be attached + utils::WaitIdleForDebugger(pPhysicalDevice->GetRuntimeSettings().waitForDebugger, + &pPhysicalDevice->GetRuntimeSettings().waitForDebuggerExecutableName[0], + pPhysicalDevice->GetRuntimeSettings().debugTimeout); #endif } @@ -1157,6 +1156,8 @@ void PAL_STDCALL Instance::PalDeveloperCallback( if (pInstance->IsTracingSupportEnabled()) { + // This is required to trap internal barriers and dispatches performed by PAL so that they can be correctly + // annotated to RGP. SqttMgr::PalDeveloperCallback(pInstance, deviceIndex, type, pCbData); } diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 1c511d04..a713a1d9 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -366,6 +366,8 @@ VkResult Memory::Create( } } + const Device::DeviceFeatures& deviceFeatures = pDevice->GetEnabledFeatures(); + if (vkResult == VK_SUCCESS) { // Account for committed size in logical device. The destructor will decrease the counter accordingly. 
@@ -399,9 +401,10 @@ VkResult Memory::Create( if (pPalGpuMem != nullptr) { - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.gpuMemoryEventHandler) { pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanAllocateEvent( + pDevice, pPalGpuMem, Memory::IntValueFromHandle(*pMemoryHandle), VK_OBJECT_TYPE_DEVICE_MEMORY, @@ -431,9 +434,10 @@ VkResult Memory::Create( } else { - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) + if (deviceFeatures.deviceMemoryReport) { pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanAllocationFailedEvent( + pDevice, pAllocInfo->allocationSize, VK_OBJECT_TYPE_DEVICE_MEMORY, pAllocInfo->memoryTypeIndex); @@ -976,11 +980,6 @@ void Memory::Free( Pal::IGpuMemory* pGpuMemory = m_pPalMemory[i][i]; if (pGpuMemory != nullptr) { - if (pDevice->GetEnabledFeatures().deviceMemoryReport == true) - { - pDevice->VkInstance()->GetGpuMemoryEventHandler()->VulkanFreeEvent(pGpuMemory); - } - Pal::IDevice* pPalDevice = pDevice->PalDevice(i); pDevice->RemoveMemReference(pPalDevice, pGpuMemory); diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index a77e55ff..49a2b7ba 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -297,6 +297,31 @@ static bool VerifyAstcLdrFormatSupport( VerifyFormatSupport(dev, VK_FORMAT_ASTC_12x12_SRGB_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); return astcLdrSupport; } +// ===================================================================================================================== +// Returns true if the given physical device supports the minimum required compressed texture formats to report ASTC-HDR +// support +static VkBool32 VerifyAstcHdrFormatSupport( + const PhysicalDevice& dev) +{ + // Based on vulkan spec Table 68. 
ASTC HDR compressed formats with VkImageType + // VK_IMAGE_TYPE_2D + const VkBool32 astcHdrSupport = + VerifyFormatSupport(dev, VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_6x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_6x6_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_8x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_8x6_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_8x8_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x5_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x6_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x8_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_10x10_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_12x10_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) && + VerifyFormatSupport(dev, VK_FORMAT_ASTC_12x12_SFLOAT_BLOCK, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + return astcHdrSupport; +} // ===================================================================================================================== // Returns true if the given physical device supports the minimum required BC compressed texture format @@ -633,7 +658,7 @@ static void GenerateCacheUuid( uint32 vulkanIcdVersion; uint32 palInterfaceVersion; uint32 osHash; - uint32 buildTimeHas; + uint32 buildTimeHash; } cacheVersionInfo = { Util::HashLiteralString("pipelineCache"), @@ -1010,7 +1035,9 @@ VkResult PhysicalDevice::Initialize() m_memoryTypeMaskForExternalSharing = m_memoryTypeMask; - if (result == Pal::Result::Success) + VkResult vkResult = PalToVkResult(result); + + if (vkResult == VK_SUCCESS) { // Determine if EQAA is supported by checking if, for each MSAA fragment count, all sample combos are okay. const auto& imgProps = PalProperties().imageProperties; @@ -1033,26 +1060,15 @@ VkResult PhysicalDevice::Initialize() m_eqaaSupported &= Util::TestAllFlagsSet(imgProps.msaaSupport, Pal::MsaaFlags::MsaaAllF1); break; } - } - // Generate our cache UUID. - // This can be use later as a "namespace" for Uuid3()/Uuid5() calls for individual pipelines - if (result == Pal::Result::Success) - { + // Generate our cache UUID. 
+ // This can be used later as a "namespace" for Uuid3()/Uuid5() calls for individual pipelines GenerateCacheUuid(settings, PalProperties(), m_appProfile, &m_pipelineCacheUUID); - } - // Collect properties for perf experiments (this call can fail; we just don't report support for - // perf measurement extension then) - if (result == Pal::Result::Success) - { + // Collect properties for perf experiments (this call can fail; we just don't report support for + // perf measurement extension then) PopulateGpaProperties(); - } - VkResult vkResult = PalToVkResult(result); - - if (vkResult == VK_SUCCESS) - { InitializePlatformKey(settings); vkResult = m_compiler.Initialize(); } @@ -2505,72 +2521,48 @@ void PhysicalDevice::PopulateLimits() // Maximum number of components of output variables which may be output by a vertex shader. m_limits.maxVertexOutputComponents = 128; - // OGL: SI_MAX_VP_VARYING_COMPONENTS - // Maximum tessellation generation level supported by the fixed function tessellation primitive generator. m_limits.maxTessellationGenerationLevel = 64; - // OGL: SI_MAX_TESS_FACTOR - // Maximum patch size, in vertices, of patches that can be processed by the tessellation primitive generator. // This is specified by the patchControlPoints of the VkPipelineTessellationStateCreateInfo structure. m_limits.maxTessellationPatchSize = 32; - // OGL: pHpCaps->maxVertexCountPerPatch = SI_MAX_VERTEX_COUNT_PER_PATCH; - // Maximum number of components of input variables which may be provided as per-vertex inputs to the tessellation // control shader stage. m_limits.maxTessellationControlPerVertexInputComponents = 128; - // OGL: pHpCaps->maxTessControlInputComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS; - // Maximum number of components of per-vertex output variables which may be output from the tessellation control // shader stage. m_limits.maxTessellationControlPerVertexOutputComponents = 128; - // OGL: pHpCaps->maxHullVaryingComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS; - // Maximum number of components of per-patch output variables which may be output from the tessellation control // shader stage. m_limits.maxTessellationControlPerPatchOutputComponents = 120; - // OGL: pHpCaps->maxTessControlPatchComponents = SI_MAX_TESS_CONTROL_PATCH_COMPONENTS; - // Maximum total number of components of per-vertex and per-patch output variables which may be output from the // tessellation control shader stage. (The total number of components of active per-vertex and per-patch outputs is // derived by multiplying the per-vertex output component count by the output patch size and then adding the // per-patch output component count. The total component count may not exceed this limit.) m_limits.maxTessellationControlTotalOutputComponents = 4096; - // OGL: pHpCaps->maxTessControlTotalOutputComponents = SI_MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS; - // Maximum number of components of input variables which may be provided as per-vertex inputs to the tessellation // evaluation shader stage. m_limits.maxTessellationEvaluationInputComponents = 128; - // OGL: pDpCaps->maxTessEvaluationInputComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS [sic] - // Maximum number of components of per-vertex output variables which may be output from the tessellation evaluation // shader stage m_limits.maxTessellationEvaluationOutputComponents = 128; - // OGL: pDpCaps->maxDomainVaryingComponents = SI_MAX_TESS_CONTROL_INPUT_COMPONENTS [sic] - // Maximum invocation count (per input primitive) supported for an instanced geometry shader.
m_limits.maxGeometryShaderInvocations = palProps.gfxipProperties.maxGsInvocations; - // OGL: pGpCaps->maxGeometryInvocations = SI_MAX_GP_INVOCATIONS - // Maximum number of components of input variables which may be provided as inputs to the geometry shader stage m_limits.maxGeometryInputComponents = 128; - // OGL: pGpCaps->maxGeometryVaryingComponents = SI_MAX_GP_VARYING_COMPONENTS - // Maximum number of components of output variables which may be output from the geometry shader stage. m_limits.maxGeometryOutputComponents = 128; - // OGL: pGpCaps->maxGeometryVaryingComponents = SI_MAX_GP_VARYING_COMPONENTS; (NOTE: Not a separate cap) - // Maximum number of vertices which may be emitted by any geometry shader. m_limits.maxGeometryOutputVertices = palProps.gfxipProperties.maxGsOutputVert; @@ -2581,8 +2573,6 @@ void PhysicalDevice::PopulateLimits() // Maximum number of components of input variables which may be provided as inputs to the fragment shader stage. m_limits.maxFragmentInputComponents = 128; - // OGL: pFpCaps->maxFragmentInputComponents = SI_MAX_VP_VARYING_COMPONENTS; - // Maximum number of output attachments which may be written to by the fragment shader stage. m_limits.maxFragmentOutputAttachments = Pal::MaxColorTargets; @@ -2590,8 +2580,6 @@ void PhysicalDevice::PopulateLimits() // enabled and one of the dual source blend modes is in use. m_limits.maxFragmentDualSrcAttachments = 1; - // OGL: pCaps->buf.maxDualSourceDrawBuf = SI_MAX_DUAL_SOURCE_COLOR_BUFFERS; - // NOTE: This could be num_cbs / 2 = 4. When dual source blending is on, two source colors are written per // attachment and to facilitate this the HW operates such that the odd-numbered CBs do not get used. OGL still // reports only 1 dual source attachment though, and I think DX API spec locks you into a single dual source @@ -2616,8 +2604,6 @@ void PhysicalDevice::PopulateLimits() m_limits.maxComputeWorkGroupCount[1] = 65535; m_limits.maxComputeWorkGroupCount[2] = 65535; - // OGL: pCpCaps->maxComputeWorkGroupCount[i] = SI_MAX_WORK_GROUP_COUNT; - const uint32_t clampedMaxThreads = Util::Min(palProps.gfxipProperties.maxThreadGroupSize, palProps.gfxipProperties.maxAsyncComputeThreadGroupSize); @@ -3827,6 +3813,9 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_QUEUE_FAMILY_FOREIGN)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_DESCRIPTOR_INDEXING)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(VALVE_MUTABLE_DESCRIPTOR_TYPE)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MUTABLE_DESCRIPTOR_TYPE)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_VARIABLE_POINTERS)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_VERTEX_ATTRIBUTE_DIVISOR)); @@ -4107,6 +4096,12 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_PHYSICAL_DEVICE_DRM)); #endif + if ((pPhysicalDevice == nullptr) || + VerifyAstcHdrFormatSupport(*pPhysicalDevice)) + { + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_TEXTURE_COMPRESSION_ASTC_HDR)); + } + return availableExtensions; } @@ -5940,7 +5935,7 @@ size_t PhysicalDevice::GetFeatures2( pExtInfo->subgroupSizeControl = VK_TRUE; pExtInfo->computeFullSubgroups = VK_TRUE; pExtInfo->synchronization2 = VK_TRUE; - pExtInfo->textureCompressionASTC_HDR = VK_FALSE; + pExtInfo->textureCompressionASTC_HDR = VerifyAstcHdrFormatSupport(*this); pExtInfo->shaderZeroInitializeWorkgroupMemory = VK_TRUE; 
pExtInfo->dynamicRendering = VK_TRUE; pExtInfo->shaderIntegerDotProduct = VK_TRUE; @@ -6551,7 +6546,7 @@ size_t PhysicalDevice::GetFeatures2( if (updateFeatures) { - pExtInfo->textureCompressionASTC_HDR = VK_FALSE; + pExtInfo->textureCompressionASTC_HDR = VerifyAstcHdrFormatSupport(*this); } structSize = sizeof(*pExtInfo); @@ -6593,6 +6588,19 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ADDRESS_BINDING_REPORT_FEATURES_EXT: + { + auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceAddressBindingReportFeaturesEXT*>(pHeader); + + if (updateFeatures) + { + pExtInfo->reportAddressBinding = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FAULT_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceFaultFeaturesEXT*>(pHeader); diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index cce9e3e0..5e2be2f2 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -353,6 +353,7 @@ VkResult Pipeline::BuildShaderStageInfo( result = pCompiler->BuildShaderModule( pDevice, flags, + 0, codeSize, pCode, adaptForFastLink, @@ -699,6 +700,7 @@ VkResult Pipeline::GetShaderDisassembly( // ===================================================================================================================== PipelineBinaryInfo* PipelineBinaryInfo::Create( + Util::MetroHash::Hash hash, size_t size, const void* pBinary, const VkAllocationCallbacks* pAllocator) @@ -717,9 +719,9 @@ PipelineBinaryInfo* PipelineBinaryInfo::Create( { pInfo = VK_PLACEMENT_NEW(pStorage) PipelineBinaryInfo(); + pInfo->binaryHash = hash; pInfo->binaryByteSize = size; pInfo->pBinary = Util::VoidPtrInc(pStorage, sizeof(PipelineBinaryInfo)); - memcpy(pInfo->pBinary, pBinary, size); } } @@ -887,11 +889,12 @@ void Pipeline::ElfHashToCacheId( hasher.Update(pDevice->GetEnabledFeatures().nullDescriptorExtended); #if VKI_RAY_TRACING - // The AccelStructTracker enable status gets stored inside the ELF within - // the static GpuRT flags. Needed for both TraceRay() and RayQuery(). if (pDevice->RayTrace() != nullptr) { + // The accel struct tracker enable and the trace ray counter states get stored inside the ELF within + // the static GpuRT flags. Needed for both TraceRay() and RayQuery().
hasher.Update(pDevice->RayTrace()->AccelStructTrackerEnabled(deviceIdx)); + hasher.Update(pDevice->RayTrace()->TraceRayCounterMode(deviceIdx)); } #endif diff --git a/icd/api/vk_pipeline_cache.cpp b/icd/api/vk_pipeline_cache.cpp index 031ac318..0f9f38e6 100644 --- a/icd/api/vk_pipeline_cache.cpp +++ b/icd/api/vk_pipeline_cache.cpp @@ -39,28 +39,17 @@ namespace vk // ===================================================================================================================== PipelineCache::PipelineCache( const Device* pDevice, - ShaderCache* pShaderCaches, PipelineBinaryCache* pBinaryCache ) : m_pDevice(pDevice), - m_shaderCaches{}, m_pBinaryCache(pBinaryCache) { - for (uint32_t i = 0; i < pDevice->NumPalDevices(); ++i) - { - const auto& cache = pShaderCaches[i]; - m_shaderCaches[i].Init(cache.GetCacheType(), cache.GetCachePtr()); - } } // ===================================================================================================================== PipelineCache::~PipelineCache() { - for (uint32_t i = 0; i < m_pDevice->NumPalDevices(); i++) - { - m_shaderCaches[i].Destroy(m_pDevice->GetCompiler(i)); - } } // ===================================================================================================================== @@ -73,20 +62,9 @@ VkResult PipelineCache::Create( VkResult result = VK_SUCCESS; const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); uint32_t numPalDevices = pDevice->NumPalDevices(); - bool useInitialData = false; - size_t shaderCacheSize = 0; - size_t pipelineCacheSize[MaxPalDevices]; bool usePipelineCacheInitialData = false; - PipelineCompilerType cacheType = pDevice->GetCompiler(DefaultDeviceIndex)->GetShaderCacheType(); - - for (uint32_t i = 0; i < numPalDevices; i++) - { - pipelineCacheSize[i] = pDevice->GetCompiler(DefaultDeviceIndex)->GetShaderCacheSize(cacheType); - shaderCacheSize += pipelineCacheSize[i]; - } - if ((pCreateInfo->initialDataSize > 0) && settings.usePipelineCacheInitialData) { const PipelineCacheHeaderData* pHeader = static_cast(pCreateInfo->pInitialData); @@ -113,21 +91,13 @@ VkResult PipelineCache::Create( { usePipelineCacheInitialData = true; } - else - { - auto pPrivateDataHeader = reinterpret_cast(pData); - if (pPrivateDataHeader->cacheType == cacheType) - { - useInitialData = true; - } - } } } } } // Allocate system memory for all objects - const size_t objSize = sizeof(PipelineCache) + shaderCacheSize; + const size_t objSize = sizeof(PipelineCache); void* pMemory = pDevice->AllocApiObject(pAllocator, objSize); if (pMemory == nullptr) @@ -136,102 +106,40 @@ VkResult PipelineCache::Create( } else { - const PipelineCachePrivateHeaderData* pPrivateDataHeader = nullptr; - const void* pBlobs[MaxPalDevices] = {}; - - if (useInitialData) - { - pPrivateDataHeader = reinterpret_cast( - Util::VoidPtrInc(pCreateInfo->pInitialData, sizeof(PipelineCacheHeaderData))); - - pBlobs[0] = Util::VoidPtrInc(pPrivateDataHeader, sizeof(PipelineCachePrivateHeaderData)); - for (uint32_t i = 1; i < numPalDevices; i++) - { - pBlobs[i] = Util::VoidPtrInc(pBlobs[i - 1], static_cast(pPrivateDataHeader->blobSize[i - 1])); - } - } - - ShaderCache shaderCaches[MaxPalDevices]; - size_t shaderCacheOffset = sizeof(PipelineCache); uint32_t expectedEntries = pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPipelineCacheExpectedEntryCount(); - for (uint32_t i = 0; i < numPalDevices; i++) + PipelineBinaryCache* pBinaryCache = nullptr; + if (settings.allowExternalPipelineCacheObject) { const void* pInitialData = nullptr; size_t initialDataSize = 0; - if 
(useInitialData) + if (usePipelineCacheInitialData) { - pInitialData = pBlobs[i]; - initialDataSize = static_cast(pPrivateDataHeader->blobSize[i]); + pInitialData = Util::VoidPtrInc(pCreateInfo->pInitialData, sizeof(PipelineCacheHeaderData)); + initialDataSize = pCreateInfo->initialDataSize - sizeof(PipelineCacheHeaderData); } - if (result == VK_SUCCESS) - { - result = pDevice->GetCompiler(DefaultDeviceIndex)->CreateShaderCache( - pInitialData, - initialDataSize, - expectedEntries, - Util::VoidPtrInc(pMemory, shaderCacheOffset), - &shaderCaches[i]); - } - else - { - break; - } - - // Move to next shader cache object - shaderCacheOffset += pipelineCacheSize[i]; - } - - // Something went wrong with creating the PAL object. Free memory - if (result != VK_SUCCESS) - { - for (uint32_t i = 0; i < numPalDevices; i++) - { - shaderCaches[i].Destroy(pDevice->GetCompiler(i)); - } - } - - if (result == VK_SUCCESS) - { - PipelineBinaryCache* pBinaryCache = nullptr; - if (settings.allowExternalPipelineCacheObject) - { - const void* pInitialData = nullptr; - size_t initialDataSize = 0; - - if (usePipelineCacheInitialData) - { - pInitialData = Util::VoidPtrInc(pCreateInfo->pInitialData, sizeof(PipelineCacheHeaderData)); - initialDataSize = pCreateInfo->initialDataSize - sizeof(PipelineCacheHeaderData); - } - - vk::PhysicalDevice* pDefaultPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); - pBinaryCache = PipelineBinaryCache::Create( - pDefaultPhysicalDevice->VkInstance()->GetAllocCallbacks(), - pDefaultPhysicalDevice->GetPlatformKey(), - pDevice->GetCompiler(DefaultDeviceIndex)->GetGfxIp(), - pDefaultPhysicalDevice->GetRuntimeSettings(), - pDefaultPhysicalDevice->PalDevice()->GetCacheFilePath(), + vk::PhysicalDevice* pDefaultPhysicalDevice = pDevice->VkPhysicalDevice(DefaultDeviceIndex); + pBinaryCache = PipelineBinaryCache::Create( + pDefaultPhysicalDevice->VkInstance()->GetAllocCallbacks(), + pDefaultPhysicalDevice->GetPlatformKey(), + pDevice->GetCompiler(DefaultDeviceIndex)->GetGfxIp(), + pDefaultPhysicalDevice->GetRuntimeSettings(), + pDefaultPhysicalDevice->PalDevice()->GetCacheFilePath(), #if ICD_GPUOPEN_DEVMODE_BUILD - pDefaultPhysicalDevice->VkInstance()->GetDevModeMgr(), + pDefaultPhysicalDevice->VkInstance()->GetDevModeMgr(), #endif - expectedEntries, - initialDataSize, - pInitialData, - false); + expectedEntries, + initialDataSize, + pInitialData, + false); - // This isn't a terminal failure, the device can continue without the pipeline cache if need be. - VK_ALERT(pBinaryCache == nullptr); - } - PipelineCache* pCache = VK_PLACEMENT_NEW(pMemory) PipelineCache(pDevice, shaderCaches, pBinaryCache); - *pPipelineCache = PipelineCache::HandleFromVoidPointer(pMemory); - } - else - { - pDevice->FreeApiObject(pAllocator, pMemory); + // This isn't a terminal failure, the device can continue without the pipeline cache if need be. + VK_ALERT(pBinaryCache == nullptr); } + PipelineCache* pCache = VK_PLACEMENT_NEW(pMemory) PipelineCache(pDevice, pBinaryCache); + *pPipelineCache = PipelineCache::HandleFromVoidPointer(pMemory); } return result; @@ -257,6 +165,7 @@ VkResult PipelineCache::Destroy( } // ===================================================================================================================== +// Stores AMD specific pipeline cache data to binary cache. 
VkResult PipelineCache::GetData( void* pData, size_t* pSize) @@ -271,43 +180,7 @@ VkResult PipelineCache::GetData( } else { - uint32_t numPalDevices = m_pDevice->NumPalDevices(); - - size_t allBlobSize = sizeof(PipelineCachePrivateHeaderData); - PipelineCachePrivateHeaderData headerData = {}; - - headerData.cacheType = m_shaderCaches[0].GetCacheType(); - for (uint32_t i = 0; i < numPalDevices; i++) - { - size_t blobSize = 0; - result = m_shaderCaches[i].Serialize(nullptr, &blobSize); - VK_ASSERT(result == VK_SUCCESS); - headerData.blobSize[i] = blobSize; - allBlobSize += blobSize; - } - - if (*pSize == 0) - { - *pSize = allBlobSize; - } - else - { - VK_ASSERT(*pSize >= allBlobSize); - memcpy(pData, &headerData, sizeof(headerData)); - - void* pBlob = Util::VoidPtrInc(pData, sizeof(headerData)); - - for (uint32_t i = 0; i < numPalDevices; i++) - { - size_t blobSize = static_cast<size_t>(headerData.blobSize[i]); - result = m_shaderCaches[i].Serialize(pBlob, &blobSize); - if (result != VK_SUCCESS) - { - break; - } - pBlob = Util::VoidPtrInc(pBlob, blobSize); - } - } + *pSize = 0; } return result; @@ -333,34 +206,6 @@ VkResult PipelineCache::Merge( result = m_pBinaryCache->Merge(srcCacheCount, &binaryCaches[0]); } - else - { - Util::AutoBuffer shaderCaches( - srcCacheCount * m_pDevice->NumPalDevices(), - m_pDevice->VkInstance()->Allocator()); - - for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++) - { - for (uint32_t cacheIdx = 0; cacheIdx < srcCacheCount; cacheIdx++) - { - VK_ASSERT(ppSrcCaches[cacheIdx]->GetShaderCache(deviceIdx).GetCacheType() == - GetShaderCache(deviceIdx).GetCacheType()); - // Store all PAL caches like this d0c0,d0c1,d0c2...,d1c0,d1c2,d1c3... - shaderCaches[deviceIdx * srcCacheCount + cacheIdx] = - ppSrcCaches[cacheIdx]->GetShaderCache(deviceIdx).GetCachePtr(); - } - } - - for (uint32_t i = 0; i < m_pDevice->NumPalDevices(); i++) - { - result = m_shaderCaches[i].Merge(srcCacheCount, &shaderCaches[i * srcCacheCount]); - - if (result != VK_SUCCESS) - { - break; - } - } - } return result; } diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp index 5f7dae92..73da28a6 100644 --- a/icd/api/vk_pipeline_layout.cpp +++ b/icd/api/vk_pipeline_layout.cpp @@ -301,6 +301,40 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( const uint32_t pushConstRegCount = pushConstantsSizeInBytes / sizeof(uint32_t); + uint32_t gfxReservedCount = 0; + // Reserve a user-data entry to store the VA of the buffer for transform feedback. + if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)) + { + gfxReservedCount++; + } + + if (pDevice->GetRuntimeSettings().enableDebugPrintf) + { + gfxReservedCount++; + } + +#if VKI_RAY_TRACING + if (HasRayTracing(pIn)) + { + gfxReservedCount += (InternalConstBufferRegCount + MaxTraceRayUserDataRegCount); + } +#endif + + // Reserve the user data entries for the uber-fetch shader const buffer + if (IsUberFetchShaderEnabled(pDevice) && + (settings.enableEarlyCompile == false)) + { + gfxReservedCount += InternalConstBufferRegCount; + } + + // Reserve PAL internal user data nodes for base vertex, base instance, draw id and lds_esgs_size. + gfxReservedCount += 4; + + const uint32_t gfxInlinePushDescriptorUserDataLimit = + (settings.gfxInlinePushDescriptorUserDataLimit > gfxReservedCount) ?
+ settings.gfxInlinePushDescriptorUserDataLimit - gfxReservedCount : + 0; + // Populate user data layouts for each descriptor set that is active pUserDataLayout->setBindingRegBase = pInfo->userDataRegCount; @@ -331,7 +365,8 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( { uint32_t regCountSpillLimit = (setLayoutInfo.activeStageMask == VK_SHADER_STAGE_COMPUTE_BIT) ? settings.csInlinePushDescriptorUserDataLimit : - settings.gfxInlinePushDescriptorUserDataLimit; + gfxInlinePushDescriptorUserDataLimit; + uint32_t inlineRegCount = pInfo->userDataRegCount + setLayoutInfo.sta.dwSize + pushConstRegCount; diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index 5fafa58b..4f29ea61 100644 --- a/icd/api/vk_query.cpp +++ b/icd/api/vk_query.cpp @@ -280,29 +280,38 @@ VkResult PalQueryPool::GetResults( const uint32_t numXfbQueryDataElems = availability ? 3 : 2; // Vulkan supports 32-bit unsigned integer values data of transform feedback query, but Pal supports 64-bit only. - // So the query data is stored into xfbQueryData first. - Util::AutoBuffer xfbQueryData(queryCount * numXfbQueryDataElems, - pDevice->VkInstance()->Allocator()); + // So the query data is stored into pXfbQueryData first. + uint64_t* pXfbQueryData = nullptr; if (m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) { - pQueryData = &xfbQueryData[0]; - queryDataStride = sizeof(uint64_t) * numXfbQueryDataElems; - queryDataSize = sizeof(uint64_t) * numXfbQueryDataElems * queryCount; + queryDataStride = sizeof(uint64_t) * numXfbQueryDataElems; + queryDataSize = queryDataStride * queryCount; queryFlags |= VK_QUERY_RESULT_64_BIT; - } - Pal::Result palResult = m_pPalQueryPool[DefaultDeviceIndex]->GetResults( - VkToPalQueryResultFlags(queryFlags), - m_palQueryType, - startQuery, - queryCount, - m_internalMem.CpuAddr(DefaultDeviceIndex), - &queryDataSize, - pQueryData, - static_cast<size_t>(queryDataStride)); + pXfbQueryData = static_cast<uint64_t*>(pDevice->VkInstance()->AllocMem( + queryDataSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)); + if (pXfbQueryData == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + pQueryData = pXfbQueryData; + } - result = PalToVkResult(palResult); + if (result == VK_SUCCESS) + { + Pal::Result palResult = m_pPalQueryPool[DefaultDeviceIndex]->GetResults( + VkToPalQueryResultFlags(queryFlags), + m_palQueryType, + startQuery, + queryCount, + m_internalMem.CpuAddr(DefaultDeviceIndex), + &queryDataSize, + pQueryData, + static_cast<size_t>(queryDataStride)); + + result = PalToVkResult(palResult); + } if ((m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) && ((result == VK_SUCCESS) || (result == VK_NOT_READY))) @@ -311,7 +320,7 @@ VkResult PalQueryPool::GetResults( for (size_t i = 0; i < queryCount; i++) { - uint64_t* pXfbQueryData = static_cast<uint64_t*>(&xfbQueryData[i * numXfbQueryDataElems]); + uint64_t* pXfbQueryElem = &pXfbQueryData[i * numXfbQueryDataElems]; // The number of written primitives and the number of needed primitives are in reverse order in Pal. if ((flags & VK_QUERY_RESULT_64_BIT) == 0) @@ -320,14 +329,14 @@ if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { - pPrimitivesCount[0] = static_cast<uint32_t>(pXfbQueryData[1]); - pPrimitivesCount[1] = static_cast<uint32_t>(pXfbQueryData[0]); + pPrimitivesCount[0] = static_cast<uint32_t>(pXfbQueryElem[1]); + pPrimitivesCount[1] = static_cast<uint32_t>(pXfbQueryElem[0]); } if (availability) { // Set the availability state to the last slot.
- pPrimitivesCount[2] = static_cast<uint32_t>(pXfbQueryData[2]); + pPrimitivesCount[2] = static_cast<uint32_t>(pXfbQueryElem[2]); } } else @@ -336,19 +345,21 @@ if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) { - pPrimitivesCount[0] = pXfbQueryData[1]; - pPrimitivesCount[1] = pXfbQueryData[0]; + pPrimitivesCount[0] = pXfbQueryElem[1]; + pPrimitivesCount[1] = pXfbQueryElem[0]; } if (availability) { // Set the availability state to the last slot. - pPrimitivesCount[2] = pXfbQueryData[2]; + pPrimitivesCount[2] = pXfbQueryElem[2]; } } pData = Util::VoidPtrInc(pData, static_cast<size_t>(stride)); } + + pDevice->VkInstance()->FreeMem(pXfbQueryData); } } diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index fa1f4542..c07b09b1 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -1596,8 +1596,6 @@ VkResult Queue::Submit( return result; } -// ===================================================================================================================== - // ===================================================================================================================== // Wait for a queue to go idle VkResult Queue::WaitIdle(void) diff --git a/icd/api/vk_shader.cpp b/icd/api/vk_shader.cpp index c108d16e..db859aca 100644 --- a/icd/api/vk_shader.cpp +++ b/icd/api/vk_shader.cpp @@ -200,7 +200,7 @@ VkResult ShaderModule::Init(const Device* pDevice, VkShaderModuleCreateFlags fla PipelineCompiler* pCompiler = pDevice->GetCompiler(DefaultDeviceIndex); VkResult result = pCompiler->BuildShaderModule( - pDevice, flags, m_codeSize, m_pCode, false, false, nullptr, nullptr, &m_handle); + pDevice, flags, 0, m_codeSize, m_pCode, false, false, nullptr, nullptr, &m_handle); if (result == VK_SUCCESS) { diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp index 7cea34ff..32bbe13b 100644 --- a/icd/api/vk_utils.cpp +++ b/icd/api/vk_utils.cpp @@ -45,6 +45,42 @@ uint32_t GetBuildTimeHash() } #if DEBUG +// ===================================================================================================================== +// If turned on and exe name is a match, this function spins idle until we have a debugger hooked.
+void WaitIdleForDebugger( + bool waitIdleToggled, + const char* pWaitIdleExeName, + uint32_t debugTimeout) +{ + if (waitIdleToggled) + { + bool waitForDebugger = false; + + if (strlen(pWaitIdleExeName) == 0) + { + // No executable name specified, apply on all Vulkan applications + waitForDebugger = true; + } + else + { + // Apply if executable name is a match + char appName[PATH_MAX]; + char appPath[PATH_MAX]; + utils::GetExecutableNameAndPath(appName, appPath); + + waitForDebugger = strcmp(pWaitIdleExeName, &appName[0]) == 0; + } + + if (waitForDebugger) + { + // Timeout the driver to give debuggers a chance to load all of the symbols + if (debugTimeout != 0) + { + Util::SleepMs(debugTimeout); + } + } + } +} #endif } // namespace utils diff --git a/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h b/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h index 6e827ce8..0bb32392 100644 --- a/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h +++ b/icd/imported/gputexdecoder/shaders/bc3-encode-hlsl/bcn_common_api.h @@ -403,23 +403,6 @@ CMP_STATIC CGU_UINT32 cmp_clampui32(CMP_IN CGU_UINT32 v, CMP_IN CGU_UINT32 a, CM return v; } - -//# Half (in Hex) Float Comment -//# --------------------------------------------------------------------------- -//# 0001 (approx) = 0.000000059604645 smallest positive subnormal number -//# 03ff (approx) = 0.000060975552 largest subnormal number -//# 0400 (approx) = 0.00006103515625 smallest positive normal number -//# 7bff (approx) = 65504 largest normal number -//# 3bff (approx) = 0.99951172 largest number less than one -//# 3c00 (approx) = 1.00097656 smallest number larger than one -//# 3555 = 0.33325195 the rounding of 1/3 to nearest -//# c000 = -2 -//# 8000 = -0 -//# 0000 = 0 -//# 7c00 = infinity -//# fc00 = -infinity -//# Half Float Math - CMP_STATIC CGU_FLOAT HalfToFloat(CGU_UINT32 h) { #if defined(ASPM_GPU) diff --git a/icd/layers/include/vk_layer_switchable_graphics.h b/icd/layers/include/vk_layer_switchable_graphics.h index 0d122453..fcd0393e 100644 --- a/icd/layers/include/vk_layer_switchable_graphics.h +++ b/icd/layers/include/vk_layer_switchable_graphics.h @@ -52,7 +52,8 @@ struct NextLinkFuncPointers PFN_vkEnumeratePhysicalDeviceGroupsKHR pfnEnumeratePhysicalDeviceGroupsKHR; }; -typedef Util::HashMap DispatchTableHashMap; +typedef Util::HashMap, 256> DispatchTableHashMap; typedef VkResult (VKAPI_PTR *PFN_vkCreateInstance_SG)( const VkInstanceCreateInfo* pCreateInfo, diff --git a/icd/res/ver.h b/icd/res/ver.h index ef839867..0b234f03 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 262 +#define VULKAN_ICD_BUILD_VERSION 267 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION @@ -45,11 +45,11 @@ // These values specify the driver ID and driver info string #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR // "AMDOPEN" #define VULKAN_DRIVER_NAME_STR "AMD open-source driver" -#define VULKAN_DRIVER_INFO_STR "2023.Q2.1" +#define VULKAN_DRIVER_INFO_STR "2023.Q2.2" #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)" // These values tell which version of the conformance test the driver is compliant against #define CTS_VERSION_MAJOR 1 #define CTS_VERSION_MINOR 3 -#define CTS_VERSION_SUBMINOR 0 -#define CTS_VERSION_PATCH 0 +#define CTS_VERSION_SUBMINOR 3 +#define
CTS_VERSION_PATCH 1 diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 53eb6e2f..db12d65d 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -264,6 +264,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } + { + m_settings.disableImplicitInvariantExports = false; + } + #if VKI_BUILD_GFX11 if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { @@ -1062,14 +1066,17 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( #if VKI_BUILD_GFX11 else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) { + // Navi31 Mall and Tiling Settings + if (pInfo->revision == Pal::AsicRevision::Navi31) + { + // Mall no alloc settings give a ~1% gain + m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; + m_settings.mallNoAllocCtSsrPolicy = MallNoAllocCtSsrAsSnsr; + m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; - // Mall no alloc settings give a ~1% gain - m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr; - m_settings.mallNoAllocCtSsrPolicy = MallNoAllocCtSsrAsSnsr; - m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr; - - // This provides ~6% gain at 4k - m_settings.imageTilingPreference3dGpuWritable = Pal::ImageTilingPattern::YMajor; + // This provides ~6% gain at 4k + m_settings.imageTilingPreference3dGpuWritable = Pal::ImageTilingPattern::YMajor; + } } #endif } @@ -1254,6 +1261,21 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( #endif } + if (appProfile == AppProfile::RomeRemastered) + { +#if VKI_BUILD_GFX11 + if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) + { + pPalSettings->pwsMode = Pal::PwsMode::NoLateAcquirePoint; + } +#endif + } + + if (appProfile == AppProfile::Zink) + { + m_settings.padVertexBuffers = true; + } + pAllocCb->pfnFree(pAllocCb->pUserData, pInfo); } diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index 9ad3d91e..35d442fd 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -2305,6 +2305,18 @@ "Scope": "Driver", "Type": "uint64" }, + { + "Name": "DumpPipelineWithApiHash", + "Description": "Use PSO api hash as pipeline dump file name", + "Tags": [ + "SPIRV Options" + ], + "Defaults": { + "Default": false + }, + "Scope": "Driver", + "Type": "bool" + }, { "Description": "If true, duplicate pipelines will be dumped to a file with a numeric suffix attached to the filename to distinguish each copy of the pipeline.", "Tags": [ @@ -3608,7 +3620,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": false + "Default": true }, "Type": "bool", "VariableName": "rtEnableBuildParallel", @@ -3646,6 +3658,21 @@ "Scope": "Driver", "VariableName": "rtEnableAcquireReleaseInterface" }, + { + "Name": "EnableFusedInstanceNode", + "Description": "Enable fused instance node for BVH builder", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "DispatchRaysThreadGroupSize", "Type": "uint32", @@ -6617,6 +6644,18 @@ "Type": "uint32", "Name": "CpDmaCmdCopyMemoryMaxBytes" }, + { + "Description": "Disables all implicit invariant marking of exports, which in turn disables MUL/ADD -> FMA. 
This option is legal but may cause issues if applications are sensitive to FMA influencing some export results.", + "Tags": [ + "Optimization" + ], + "Defaults": { + "Default": false + }, + "Scope": "Driver", + "Type": "bool", + "Name": "DisableImplicitInvariantExports" + }, { "Description": "This value denotes whether using CmdClearBoundAttachments/CmdClearBoundDepthStencilTargets for subpass load op clears or not.", "Tags": [ @@ -6786,7 +6825,7 @@ "Optimization" ], "Defaults": { - "Default": 29 + "Default": 30 }, "Scope": "Driver", "Type": "uint32", @@ -8106,6 +8145,52 @@ "VariableName": "rtMaxRayLength", "Scope": "Driver" }, + { + "Description": "Used to wait idle on vkCreateInstance() call until a debugger is attached to the running application. Effective only on Windows debug builds.", + "Tags": [ + "Debugging" + ], + "BuildTypes": [ + "DEBUG" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Name": "WaitForDebugger", + "Scope": "Driver" + }, + { + "Description": "Executable name of a Vulkan application (e.g. vkcube.exe) upon which to wait idle until a debugger is attached. If empty, it applies to all Vulkan applications", + "Tags": [ + "Debugging" + ], + "BuildTypes": [ + "DEBUG" + ], + "Defaults": { + "Default": "" + }, + "Scope": "Driver", + "Name": "WaitForDebuggerExecutableName", + "Type": "string", + "Size": 256 + }, + { + "Description": "Timeout the driver in milliseconds to give debuggers a chance to load all of the symbols", + "Tags": [ + "Debugging" + ], + "BuildTypes": [ + "DEBUG" + ], + "Defaults": { + "Default": 0 + }, + "Scope": "Driver", + "Name": "DebugTimeout", + "Type": "uint32" + }, { "Description": "Enable printf debug functionality", "Tags": [