diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
index 605b0223..adb382d8 100644
--- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
+++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp
@@ -103,6 +103,11 @@ KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region,
     kmt_alloc_flags.ui32.NonPaged = 1;
   }
 
+  if (!m_region.IsLocalMemory() &&
+      (alloc_flags & core::MemoryRegion::AllocateMemoryOnly)) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
   // Allocating a memory handle for virtual memory
   kmt_alloc_flags.ui32.NoAddress =
       !!(alloc_flags & core::MemoryRegion::AllocateMemoryOnly);
diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
index 46a7ef5b..b9b62ad6 100644
--- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
+++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -136,6 +136,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
   amdxdna_drm_get_bo_info get_bo_info_args{0};
   drm_gem_close close_bo_args{0};
+  void *mapped_mem(nullptr);
 
   if (!m_region.IsSystem()) {
     return HSA_STATUS_ERROR_INVALID_REGION;
   }
@@ -162,18 +163,29 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
     return HSA_STATUS_ERROR;
   }
 
+  /// TODO: For now we always map the memory and keep a mapping from handles
+  /// to VA memory addresses. Once we can support the separate VMEM call to
+  /// map handles we can fix this.
   if (m_region.kernarg()) {
-    *mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
-                get_bo_info_args.map_offset);
-    if (*mem == MAP_FAILED) {
+    mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
+                      get_bo_info_args.map_offset);
+    if (mapped_mem == MAP_FAILED) {
       // Close the BO in the case when a mapping fails and we got a BO handle.
       ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
       return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
     }
   } else {
-    *mem = reinterpret_cast<void *>(get_bo_info_args.vaddr);
+    mapped_mem = reinterpret_cast<void *>(get_bo_info_args.vaddr);
   }
 
+  if (alloc_flags & core::MemoryRegion::AllocateMemoryOnly) {
+    *mem = reinterpret_cast<void *>(create_bo_args.handle);
+  } else {
+    *mem = mapped_mem;
+  }
+
+  vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem);
+
   return HSA_STATUS_SUCCESS;
 }
diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
index e08db0e6..e313eb4b 100644
--- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
+++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
@@ -43,6 +43,7 @@
 #define HSA_RUNTIME_CORE_INC_AMD_XDNA_DRIVER_H_
 
 #include
+#include <unordered_map>
 
 #include "core/inc/driver.h"
 #include "core/inc/memory_region.h"
@@ -89,6 +90,12 @@ class XdnaDriver : public core::Driver {
   hsa_status_t InitDeviceHeap();
   hsa_status_t FreeDeviceHeap();
 
+  /// TODO: Remove this in the future and rely on the core Runtime
+  /// object to track handle allocations. Using the VMEM API for mapping XDNA
+  /// driver handles requires a bit more refactoring. So rely on the XDNA driver
+  /// to manage some of this for now.
+  std::unordered_map<uint32_t, void *> vmem_handle_mappings;
+
   /// @brief Virtual address range allocated for the device heap.
   ///
   /// Allocate a large enough space so we can carve out the device heap in
diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp
index e25cf556..ab53fbe9 100644
--- a/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -3111,20 +3111,22 @@ hsa_status_t Runtime::VMemoryHandleCreate(const MemoryRegion* region, size_t siz
                                           uint64_t flags_unused,
                                           hsa_amd_vmem_alloc_handle_t* memoryOnlyHandle) {
   const AMD::MemoryRegion* memRegion = static_cast<const AMD::MemoryRegion*>(region);
-  if (!memRegion->IsLocalMemory()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   if (!IsMultipleOf(size, memRegion->GetPageSize())) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 
   ScopedAcquire lock(&memory_lock_);
-  void* thunk_handle;
-  hsa_status_t status = region->Allocate(size, alloc_flags, &thunk_handle, 0);
+  void *user_mode_driver_handle;
+  hsa_status_t status =
+      region->Allocate(size, alloc_flags, &user_mode_driver_handle, 0);
   if (status == HSA_STATUS_SUCCESS) {
     memory_handle_map_.emplace(std::piecewise_construct,
-                               std::forward_as_tuple(thunk_handle),
-                               std::forward_as_tuple(region, size, flags_unused, thunk_handle, alloc_flags));
+                               std::forward_as_tuple(user_mode_driver_handle),
+                               std::forward_as_tuple(region, size, flags_unused,
+                                                     user_mode_driver_handle,
+                                                     alloc_flags));
 
-    *memoryOnlyHandle = MemoryHandle::Convert(thunk_handle);
+    *memoryOnlyHandle = MemoryHandle::Convert(user_mode_driver_handle);
   }
   return status;
 }
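To make the new bookkeeping concrete, below is a minimal standalone sketch (not part of the patch) of the handle-to-VA tracking that XdnaDriver::AllocateMemory now performs through its vmem_handle_mappings member: when AllocateMemoryOnly is set the caller gets back the BO handle as an opaque value, while the mmap'ed virtual address is kept on the side so a later map or free can recover it. The HandleTable class and its Record/Lookup/Forget methods are hypothetical names chosen purely for illustration.

    #include <cstdint>
    #include <unordered_map>

    // Illustrative stand-in for the vmem_handle_mappings bookkeeping: BO
    // handles act as opaque keys, and the driver remembers the mapped
    // virtual address that backs each handle.
    class HandleTable {
     public:
      // Record the VA that was mapped for a freshly created BO handle.
      void Record(uint32_t bo_handle, void *mapped_va) {
        mappings_.emplace(bo_handle, mapped_va);
      }

      // Translate a BO handle back to its VA; returns nullptr if unknown.
      void *Lookup(uint32_t bo_handle) const {
        auto it = mappings_.find(bo_handle);
        return it == mappings_.end() ? nullptr : it->second;
      }

      // Drop the entry once the BO is closed/freed.
      void Forget(uint32_t bo_handle) { mappings_.erase(bo_handle); }

     private:
      std::unordered_map<uint32_t, void *> mappings_;
    };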