From 4576ff28926792dd37f11514b6d62be3196cba6a Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 30 Mar 2021 18:41:41 -0700 Subject: [PATCH] Implementing Vulkan dispatch tracing. This is heavily inspired by the upstream TracyVulkan.hpp and https://nikitablack.github.io/post/how_to_use_vulkan_timestamp_queries/. Significant reworking was done to better support incremental collection, use host query reset when available (extension or vulkan 1.2), and use external source locations so we can provide the original executable information. This is enabled by default when IREE tracing is enabled but can be turned off with `--vulkan_tracing=false`. Only tested on Windows - there may be some timestamp mapping required on linux/android where VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT is present. There are likely corner cases with exhausted buffers but since this is a debug-only codepath the worst that will happen is that the trace gets corrupted. --- iree/hal/vulkan/BUILD | 2 + iree/hal/vulkan/CMakeLists.txt | 2 + iree/hal/vulkan/api.h | 14 +- iree/hal/vulkan/command_queue.h | 16 +- iree/hal/vulkan/direct_command_buffer.cc | 27 + iree/hal/vulkan/direct_command_buffer.h | 2 + iree/hal/vulkan/direct_command_queue.cc | 8 +- iree/hal/vulkan/direct_command_queue.h | 2 +- iree/hal/vulkan/dynamic_symbol_tables.h | 15 +- iree/hal/vulkan/extensibility_util.cc | 12 + iree/hal/vulkan/extensibility_util.h | 4 + iree/hal/vulkan/native_executable.cc | 71 +- iree/hal/vulkan/native_executable.h | 12 + iree/hal/vulkan/registration/driver_module.cc | 6 + iree/hal/vulkan/serializing_command_queue.cc | 7 +- iree/hal/vulkan/serializing_command_queue.h | 2 +- iree/hal/vulkan/tracing.cc | 648 ++++++++++++++++++ iree/hal/vulkan/tracing.h | 179 +++++ iree/hal/vulkan/vulkan_device.cc | 158 ++++- 19 files changed, 1119 insertions(+), 68 deletions(-) create mode 100644 iree/hal/vulkan/tracing.cc create mode 100644 iree/hal/vulkan/tracing.h diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD index 5a21c011a647..f76bd3f9e1bf 100644 --- a/iree/hal/vulkan/BUILD +++ b/iree/hal/vulkan/BUILD @@ -72,6 +72,8 @@ cc_library( "status_util.h", "timepoint_util.cc", "timepoint_util.h", + "tracing.cc", + "tracing.h", "vma_allocator.cc", "vma_allocator.h", "vma_buffer.cc", diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt index ee165a9f7589..7a567dd9cb5f 100644 --- a/iree/hal/vulkan/CMakeLists.txt +++ b/iree/hal/vulkan/CMakeLists.txt @@ -61,6 +61,8 @@ iree_cc_library( "status_util.h" "timepoint_util.cc" "timepoint_util.h" + "tracing.cc" + "tracing.h" "vma_allocator.cc" "vma_allocator.h" "vma_buffer.cc" diff --git a/iree/hal/vulkan/api.h b/iree/hal/vulkan/api.h index 3f5912ae295e..35b1c2c8ad08 100644 --- a/iree/hal/vulkan/api.h +++ b/iree/hal/vulkan/api.h @@ -37,10 +37,20 @@ extern "C" { enum iree_hal_vulkan_feature_e { // Use VK_LAYER_KHRONOS_standard_validation to validate Vulkan API usage. // Has a significant performance penalty and is *not* a security mechanism. - IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS = 1 << 0, + IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS = 1u << 0, // Use VK_EXT_debug_utils, record markers, and log errors. - IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS = 1 << 1, + IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS = 1u << 1, + + // Enables tracing of command buffers when IREE tracing is enabled. + // May take advantage of additional extensions for more accurate timing or + // hardware-specific performance counters. 
+ // + // NOTE: tracing has a non-trivial overhead and will skew the timing of + // submissions and introduce false barriers between dispatches. Use this to + // identify slow dispatches and refine from there; be wary of whole-program + // tracing with this enabled. + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING = 1u << 2, }; typedef uint64_t iree_hal_vulkan_features_t; diff --git a/iree/hal/vulkan/command_queue.h b/iree/hal/vulkan/command_queue.h index 69486b838d82..ac0fff307580 100644 --- a/iree/hal/vulkan/command_queue.h +++ b/iree/hal/vulkan/command_queue.h @@ -22,6 +22,7 @@ #include "iree/hal/api.h" #include "iree/hal/vulkan/dynamic_symbols.h" #include "iree/hal/vulkan/handle_util.h" +#include "iree/hal/vulkan/tracing.h" #include "iree/hal/vulkan/util/arena.h" namespace iree { @@ -42,6 +43,15 @@ class CommandQueue { return logical_device_->syms(); } + VkQueue handle() const { return queue_; } + + iree_hal_vulkan_tracing_context_t* tracing_context() { + return tracing_context_; + } + void set_tracing_context(iree_hal_vulkan_tracing_context_t* tracing_context) { + tracing_context_ = tracing_context; + } + bool can_dispatch() const { return iree_all_bits_set(supported_categories_, IREE_HAL_COMMAND_CATEGORY_DISPATCH); @@ -52,19 +62,19 @@ class CommandQueue { virtual iree_status_t WaitIdle(iree_time_t deadline_ns) = 0; protected: - CommandQueue(VkDeviceHandle* logical_device, std::string name, + CommandQueue(VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue) : logical_device_(logical_device), - name_(std::move(name)), supported_categories_(supported_categories), queue_(queue) { iree_slim_mutex_initialize(&queue_mutex_); } VkDeviceHandle* logical_device_; - const std::string name_; const iree_hal_command_category_t supported_categories_; + iree_hal_vulkan_tracing_context_t* tracing_context_ = nullptr; + // VkQueue needs to be externally synchronized. 
iree_slim_mutex_t queue_mutex_; VkQueue queue_ IREE_GUARDED_BY(queue_mutex_); diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc index ca16c8a1c53b..610eed040b80 100644 --- a/iree/hal/vulkan/direct_command_buffer.cc +++ b/iree/hal/vulkan/direct_command_buffer.cc @@ -36,6 +36,7 @@ typedef struct { iree_hal_command_buffer_mode_t mode; iree_hal_command_category_t allowed_categories; iree_hal_queue_affinity_t queue_affinity; + iree_hal_vulkan_tracing_context_t* tracing_context; VkCommandPoolHandle* command_pool; VkCommandBuffer handle; @@ -68,6 +69,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate( iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, + iree_hal_vulkan_tracing_context_t* tracing_context, iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache, iree_hal_command_buffer_t** out_command_buffer) { IREE_ASSERT_ARGUMENT(logical_device); @@ -98,6 +100,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate( command_buffer->mode = mode; command_buffer->allowed_categories = command_categories; command_buffer->queue_affinity = queue_affinity; + command_buffer->tracing_context = tracing_context; command_buffer->command_pool = command_pool; command_buffer->handle = handle; command_buffer->syms = logical_device->syms().get(); @@ -564,6 +567,15 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch( iree_hal_vulkan_direct_command_buffer_t* command_buffer = iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer); + iree_hal_vulkan_source_location_t source_location; + iree_hal_vulkan_native_executable_entry_point_source_location( + executable, entry_point, &source_location); + IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( + command_buffer->tracing_context, command_buffer->handle, + source_location.file_name.data, source_location.file_name.size, + source_location.line, source_location.func_name.data, + source_location.func_name.size, NULL, 0); + // Get the compiled and linked pipeline for the specified entry point and // bind it to the command buffer. VkPipeline pipeline_handle = VK_NULL_HANDLE; @@ -576,6 +588,9 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch( command_buffer->syms->vkCmdDispatch(command_buffer->handle, workgroup_x, workgroup_y, workgroup_z); + IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context, + command_buffer->handle); + return iree_ok_status(); } @@ -587,6 +602,15 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch_indirect( iree_hal_vulkan_direct_command_buffer_t* command_buffer = iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer); + iree_hal_vulkan_source_location_t source_location; + iree_hal_vulkan_native_executable_entry_point_source_location( + executable, entry_point, &source_location); + IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( + command_buffer->tracing_context, command_buffer->handle, + source_location.file_name.data, source_location.file_name.size, + source_location.line, source_location.func_name.data, + source_location.func_name.size, NULL, 0); + // Get the compiled and linked pipeline for the specified entry point and // bind it to the command buffer. 
VkPipeline pipeline_handle = VK_NULL_HANDLE; @@ -602,6 +626,9 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch_indirect( command_buffer->syms->vkCmdDispatchIndirect( command_buffer->handle, workgroups_device_buffer, workgroups_offset); + IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context, + command_buffer->handle); + return iree_ok_status(); } diff --git a/iree/hal/vulkan/direct_command_buffer.h b/iree/hal/vulkan/direct_command_buffer.h index 858b521e97a4..18ef97939e66 100644 --- a/iree/hal/vulkan/direct_command_buffer.h +++ b/iree/hal/vulkan/direct_command_buffer.h @@ -18,6 +18,7 @@ #include "iree/hal/api.h" #include "iree/hal/vulkan/descriptor_pool_cache.h" #include "iree/hal/vulkan/handle_util.h" +#include "iree/hal/vulkan/tracing.h" #ifdef __cplusplus extern "C" { @@ -30,6 +31,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate( iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, + iree_hal_vulkan_tracing_context_t* tracing_context, iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache, iree_hal_command_buffer_t** out_command_buffer); diff --git a/iree/hal/vulkan/direct_command_queue.cc b/iree/hal/vulkan/direct_command_queue.cc index 461ce9aa59ab..4b78f4dbc7da 100644 --- a/iree/hal/vulkan/direct_command_queue.cc +++ b/iree/hal/vulkan/direct_command_queue.cc @@ -26,10 +26,9 @@ namespace hal { namespace vulkan { DirectCommandQueue::DirectCommandQueue( - VkDeviceHandle* logical_device, std::string name, + VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue) - : CommandQueue(logical_device, std::move(name), supported_categories, - queue) {} + : CommandQueue(logical_device, supported_categories, queue) {} DirectCommandQueue::~DirectCommandQueue() = default; @@ -134,6 +133,7 @@ iree_status_t DirectCommandQueue::WaitIdle(iree_time_t deadline_ns) { iree_status_t status = VK_RESULT_TO_STATUS(syms()->vkQueueWaitIdle(queue_), "vkQueueWaitIdle"); iree_slim_mutex_unlock(&queue_mutex_); + iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE); return status; } @@ -191,6 +191,8 @@ iree_status_t DirectCommandQueue::WaitIdle(iree_time_t deadline_ns) { syms()->vkDestroyFence(*logical_device_, fence, logical_device_->allocator()); + iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE); + return status; } diff --git a/iree/hal/vulkan/direct_command_queue.h b/iree/hal/vulkan/direct_command_queue.h index ad36aadcc58b..fc770ed1fe5f 100644 --- a/iree/hal/vulkan/direct_command_queue.h +++ b/iree/hal/vulkan/direct_command_queue.h @@ -25,7 +25,7 @@ namespace vulkan { // Command queue implementation directly maps to VkQueue. 
class DirectCommandQueue final : public CommandQueue { public: - DirectCommandQueue(VkDeviceHandle* logical_device, std::string name, + DirectCommandQueue(VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue); ~DirectCommandQueue() override; diff --git a/iree/hal/vulkan/dynamic_symbol_tables.h b/iree/hal/vulkan/dynamic_symbol_tables.h index 05dcd591a1ea..a6a5282069e3 100644 --- a/iree/hal/vulkan/dynamic_symbol_tables.h +++ b/iree/hal/vulkan/dynamic_symbol_tables.h @@ -113,7 +113,7 @@ namespace vulkan { DEV_PFN(EXCLUDED, vkCmdPushDescriptorSetWithTemplateKHR) \ DEV_PFN(EXCLUDED, vkCmdReserveSpaceForCommandsNVX) \ DEV_PFN(REQUIRED, vkCmdResetEvent) \ - DEV_PFN(EXCLUDED, vkCmdResetQueryPool) \ + DEV_PFN(REQUIRED, vkCmdResetQueryPool) \ DEV_PFN(EXCLUDED, vkCmdResolveImage) \ DEV_PFN(EXCLUDED, vkCmdSetBlendConstants) \ DEV_PFN(EXCLUDED, vkCmdSetCheckpointNV) \ @@ -174,7 +174,7 @@ namespace vulkan { DEV_PFN(EXCLUDED, vkCreateObjectTableNVX) \ DEV_PFN(REQUIRED, vkCreatePipelineCache) \ DEV_PFN(REQUIRED, vkCreatePipelineLayout) \ - DEV_PFN(EXCLUDED, vkCreateQueryPool) \ + DEV_PFN(REQUIRED, vkCreateQueryPool) \ DEV_PFN(EXCLUDED, vkCreateRayTracingPipelinesNV) \ DEV_PFN(EXCLUDED, vkCreateRenderPass) \ DEV_PFN(EXCLUDED, vkCreateRenderPass2KHR) \ @@ -207,7 +207,7 @@ namespace vulkan { DEV_PFN(REQUIRED, vkDestroyPipeline) \ DEV_PFN(REQUIRED, vkDestroyPipelineCache) \ DEV_PFN(REQUIRED, vkDestroyPipelineLayout) \ - DEV_PFN(EXCLUDED, vkDestroyQueryPool) \ + DEV_PFN(REQUIRED, vkDestroyQueryPool) \ DEV_PFN(EXCLUDED, vkDestroyRenderPass) \ DEV_PFN(EXCLUDED, vkDestroySampler) \ DEV_PFN(EXCLUDED, vkDestroySamplerYcbcrConversion) \ @@ -228,7 +228,7 @@ namespace vulkan { DEV_PFN(REQUIRED, vkGetBufferMemoryRequirements) \ DEV_PFN(EXCLUDED, vkGetBufferMemoryRequirements2) \ DEV_PFN(EXCLUDED, vkGetBufferMemoryRequirements2KHR) \ - DEV_PFN(EXCLUDED, vkGetCalibratedTimestampsEXT) \ + DEV_PFN(OPTIONAL, vkGetCalibratedTimestampsEXT) \ DEV_PFN(EXCLUDED, vkGetDescriptorSetLayoutSupport) \ DEV_PFN(EXCLUDED, vkGetDescriptorSetLayoutSupportKHR) \ DEV_PFN(EXCLUDED, vkGetDeviceGroupPeerMemoryFeatures) \ @@ -255,7 +255,7 @@ namespace vulkan { DEV_PFN(EXCLUDED, vkGetMemoryHostPointerPropertiesEXT) \ DEV_PFN(EXCLUDED, vkGetPastPresentationTimingGOOGLE) \ DEV_PFN(REQUIRED, vkGetPipelineCacheData) \ - DEV_PFN(EXCLUDED, vkGetQueryPoolResults) \ + DEV_PFN(REQUIRED, vkGetQueryPoolResults) \ DEV_PFN(EXCLUDED, vkGetRayTracingShaderGroupHandlesNV) \ DEV_PFN(EXCLUDED, vkGetRefreshCycleDurationGOOGLE) \ DEV_PFN(EXCLUDED, vkGetRenderAreaGranularity) \ @@ -278,7 +278,8 @@ namespace vulkan { DEV_PFN(REQUIRED, vkResetDescriptorPool) \ DEV_PFN(REQUIRED, vkResetEvent) \ DEV_PFN(REQUIRED, vkResetFences) \ - DEV_PFN(EXCLUDED, vkResetQueryPoolEXT) \ + DEV_PFN(OPTIONAL, vkResetQueryPool) \ + DEV_PFN(OPTIONAL, vkResetQueryPoolEXT) \ DEV_PFN(OPTIONAL, vkSetDebugUtilsObjectNameEXT) \ DEV_PFN(OPTIONAL, vkSetDebugUtilsObjectTagEXT) \ DEV_PFN(REQUIRED, vkSetEvent) \ @@ -322,7 +323,7 @@ namespace vulkan { INS_PFN(EXCLUDED, vkGetDisplayPlaneCapabilities2KHR) \ INS_PFN(EXCLUDED, vkGetDisplayPlaneCapabilitiesKHR) \ INS_PFN(EXCLUDED, vkGetDisplayPlaneSupportedDisplaysKHR) \ - INS_PFN(EXCLUDED, vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) \ + INS_PFN(OPTIONAL, vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) \ INS_PFN(EXCLUDED, vkGetPhysicalDeviceCooperativeMatrixPropertiesNV) \ INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPlaneProperties2KHR) \ INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPlanePropertiesKHR) \ diff 
--git a/iree/hal/vulkan/extensibility_util.cc b/iree/hal/vulkan/extensibility_util.cc index 7320cd34c225..0bd9805976ee 100644 --- a/iree/hal/vulkan/extensibility_util.cc +++ b/iree/hal/vulkan/extensibility_util.cc @@ -206,6 +206,12 @@ iree_hal_vulkan_populate_enabled_device_extensions( } else if (strcmp(extension_name, VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME) == 0) { extensions.timeline_semaphore = true; + } else if (strcmp(extension_name, VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME) == + 0) { + extensions.host_query_reset = true; + } else if (strcmp(extension_name, + VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) { + extensions.calibrated_timestamps = true; } } return extensions; @@ -222,5 +228,11 @@ iree_hal_vulkan_infer_enabled_device_extensions( if (device_syms->vkSignalSemaphore || device_syms->vkSignalSemaphoreKHR) { extensions.timeline_semaphore = true; } + if (device_syms->vkResetQueryPoolEXT) { + extensions.host_query_reset = true; + } + if (device_syms->vkGetCalibratedTimestampsEXT) { + extensions.calibrated_timestamps = true; + } return extensions; } diff --git a/iree/hal/vulkan/extensibility_util.h b/iree/hal/vulkan/extensibility_util.h index 3468794debbc..7f398f1263a0 100644 --- a/iree/hal/vulkan/extensibility_util.h +++ b/iree/hal/vulkan/extensibility_util.h @@ -83,6 +83,10 @@ typedef struct { bool push_descriptors : 1; // VK_KHR_timeline_semaphore is enabled. bool timeline_semaphore : 1; + // VK_EXT_host_query_reset is enabled. + bool host_query_reset : 1; + // VK_EXT_calibrated_timestamps is enabled. + bool calibrated_timestamps : 1; } iree_hal_vulkan_device_extensions_t; // Returns a bitfield with all of the provided extension names. diff --git a/iree/hal/vulkan/native_executable.cc b/iree/hal/vulkan/native_executable.cc index 9cd7ae05ce72..0b93ad3792c8 100644 --- a/iree/hal/vulkan/native_executable.cc +++ b/iree/hal/vulkan/native_executable.cc @@ -26,6 +26,11 @@ using namespace iree::hal::vulkan; +typedef struct { + VkPipeline pipeline; + iree_string_view_t name; +} iree_hal_vulkan_entry_point_t; + static iree_status_t iree_hal_vulkan_create_shader_module( VkDeviceHandle* logical_device, iree_const_byte_span_t code, VkShaderModule* out_shader_module) { @@ -55,7 +60,8 @@ static iree_status_t iree_hal_vulkan_create_pipelines( iree_SpirVExecutableDef_table_t executable_def, VkShaderModule shader_module, iree_host_size_t executable_layout_count, iree_hal_executable_layout_t* const* executable_layouts, - iree_host_size_t pipeline_count, VkPipeline* out_pipelines) { + iree_host_size_t pipeline_count, + iree_hal_vulkan_entry_point_t* out_entry_points) { VkComputePipelineCreateInfo* create_infos = NULL; IREE_RETURN_IF_ERROR(iree_allocator_malloc( logical_device->host_allocator(), @@ -96,11 +102,18 @@ static iree_status_t iree_hal_vulkan_create_pipelines( stage_create_info->pSpecializationInfo = NULL; } + VkPipeline* pipelines = + (VkPipeline*)iree_alloca(pipeline_count * sizeof(VkPipeline)); iree_status_t status = VK_RESULT_TO_STATUS( logical_device->syms()->vkCreateComputePipelines( *logical_device, pipeline_cache, (uint32_t)pipeline_count, - create_infos, logical_device->allocator(), out_pipelines), + create_infos, logical_device->allocator(), pipelines), "vkCreateComputePipelines"); + if (iree_status_is_ok(status)) { + for (iree_host_size_t i = 0; i < pipeline_count; ++i) { + out_entry_points[i].pipeline = pipelines[i]; + } + } iree_allocator_free(logical_device->host_allocator(), create_infos); return status; @@ -179,8 +192,8 @@ static iree_status_t 
iree_hal_spirv_executable_flatbuffer_verify( typedef struct { iree_hal_resource_t resource; VkDeviceHandle* logical_device; - iree_host_size_t pipeline_count; - VkPipeline pipelines[]; + iree_host_size_t entry_point_count; + iree_hal_vulkan_entry_point_t entry_points[]; } iree_hal_vulkan_native_executable_t; extern const iree_hal_executable_vtable_t @@ -226,31 +239,43 @@ iree_status_t iree_hal_vulkan_native_executable_create( // Create pipelines for each entry point. flatbuffers_string_vec_t entry_points_vec = iree_SpirVExecutableDef_entry_points_get(executable_def); - iree_host_size_t pipeline_count = + iree_host_size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); iree_hal_vulkan_native_executable_t* executable = NULL; iree_host_size_t total_size = - sizeof(*executable) + pipeline_count * sizeof(*executable->pipelines); + sizeof(*executable) + + entry_point_count * sizeof(*executable->entry_points); iree_status_t status = iree_allocator_malloc(logical_device->host_allocator(), total_size, (void**)&executable); if (iree_status_is_ok(status)) { iree_hal_resource_initialize(&iree_hal_vulkan_native_executable_vtable, &executable->resource); executable->logical_device = logical_device; - executable->pipeline_count = pipeline_count; - memset(executable->pipelines, 0, - pipeline_count * sizeof(*executable->pipelines)); + executable->entry_point_count = entry_point_count; + memset(executable->entry_points, 0, + entry_point_count * sizeof(*executable->entry_points)); } if (iree_status_is_ok(status)) { status = iree_hal_vulkan_create_pipelines( logical_device, pipeline_cache, executable_spec->caching_mode, executable_def, shader_module, executable_spec->executable_layout_count, - executable_spec->executable_layouts, executable->pipeline_count, - executable->pipelines); + executable_spec->executable_layouts, executable->entry_point_count, + executable->entry_points); } iree_hal_vulkan_destroy_shader_module(logical_device, shader_module); + if (iree_status_is_ok(status)) { + flatbuffers_string_vec_t entry_points_vec = + iree_SpirVExecutableDef_entry_points_get(executable_def); + for (iree_host_size_t i = 0; i < entry_point_count; ++i) { + flatbuffers_string_t name = + flatbuffers_string_vec_at(entry_points_vec, i); + executable->entry_points[i].name = + iree_make_string_view(name, flatbuffers_string_len(name)); + } + } + if (iree_status_is_ok(status)) { *out_executable = (iree_hal_executable_t*)executable; } else { @@ -269,25 +294,41 @@ static void iree_hal_vulkan_native_executable_destroy( executable->logical_device->host_allocator(); IREE_TRACE_ZONE_BEGIN(z0); - for (iree_host_size_t i = 0; i < executable->pipeline_count; ++i) { + for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) { iree_hal_vulkan_destroy_pipeline(executable->logical_device, - executable->pipelines[i]); + executable->entry_points[i].pipeline); } iree_allocator_free(host_allocator, executable); IREE_TRACE_ZONE_END(z0); } +void iree_hal_vulkan_native_executable_entry_point_source_location( + iree_hal_executable_t* base_executable, iree_host_size_t entry_ordinal, + iree_hal_vulkan_source_location_t* out_source_location) { + iree_hal_vulkan_native_executable_t* executable = + iree_hal_vulkan_native_executable_cast(base_executable); + memset(out_source_location, 0, sizeof(*out_source_location)); + if (entry_ordinal >= executable->entry_point_count) { + return; + } + out_source_location->func_name = executable->entry_points[entry_ordinal].name; + + // TODO(benvanik): plumb through file name/line 
for the MLIR function. + out_source_location->file_name = out_source_location->func_name; + out_source_location->line = 0; +} + iree_status_t iree_hal_vulkan_native_executable_pipeline_for_entry_point( iree_hal_executable_t* base_executable, iree_host_size_t entry_ordinal, VkPipeline* out_pipeline_handle) { iree_hal_vulkan_native_executable_t* executable = iree_hal_vulkan_native_executable_cast(base_executable); - if (entry_ordinal >= executable->pipeline_count) { + if (entry_ordinal >= executable->entry_point_count) { return iree_make_status(IREE_STATUS_OUT_OF_RANGE, "invalid entry point ordinal %zu", entry_ordinal); } - *out_pipeline_handle = executable->pipelines[entry_ordinal]; + *out_pipeline_handle = executable->entry_points[entry_ordinal].pipeline; return iree_ok_status(); } diff --git a/iree/hal/vulkan/native_executable.h b/iree/hal/vulkan/native_executable.h index 0f7eed7df0c8..ab1222315c67 100644 --- a/iree/hal/vulkan/native_executable.h +++ b/iree/hal/vulkan/native_executable.h @@ -26,6 +26,12 @@ extern "C" { #endif // __cplusplus +typedef struct { + iree_string_view_t file_name; + int line; + iree_string_view_t func_name; +} iree_hal_vulkan_source_location_t; + // Creates a wrapper for one or more VkPipelines that are sourced from the same // IREE executable. Each of the pipelines will share the same shader module // and just differs by the entry point into the shader module they reference. @@ -35,6 +41,12 @@ iree_status_t iree_hal_vulkan_native_executable_create( const iree_hal_executable_spec_t* executable_spec, iree_hal_executable_t** out_executable); +// Returns the source location for the given entry point. May be empty if not +// available. +void iree_hal_vulkan_native_executable_entry_point_source_location( + iree_hal_executable_t* executable, iree_host_size_t entry_ordinal, + iree_hal_vulkan_source_location_t* out_source_location); + // Returns the cached VkPipeline for the given executable |entry_ordinal|. 
iree_status_t iree_hal_vulkan_native_executable_pipeline_for_entry_point( iree_hal_executable_t* executable, iree_host_size_t entry_ordinal, diff --git a/iree/hal/vulkan/registration/driver_module.cc b/iree/hal/vulkan/registration/driver_module.cc index 76c228fa2361..761943b3fecb 100644 --- a/iree/hal/vulkan/registration/driver_module.cc +++ b/iree/hal/vulkan/registration/driver_module.cc @@ -35,6 +35,9 @@ ABSL_FLAG(int, vulkan_default_index, 0, "Index of the default Vulkan device."); ABSL_FLAG(bool, vulkan_force_timeline_semaphore_emulation, false, "Uses timeline semaphore emulation even if native support exists."); +ABSL_FLAG(bool, vulkan_tracing, true, + "Enables Vulkan tracing (if IREE tracing is enabled)."); + static iree_status_t iree_hal_vulkan_create_driver_with_flags( iree_string_view_t identifier, iree_allocator_t allocator, iree_hal_driver_t** out_driver) { @@ -63,6 +66,9 @@ static iree_status_t iree_hal_vulkan_create_driver_with_flags( driver_options.requested_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS; } + if (absl::GetFlag(FLAGS_vulkan_tracing)) { + driver_options.requested_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING; + } driver_options.default_device_index = absl::GetFlag(FLAGS_vulkan_default_index); diff --git a/iree/hal/vulkan/serializing_command_queue.cc b/iree/hal/vulkan/serializing_command_queue.cc index 7b6732c4ce24..f03888529417 100644 --- a/iree/hal/vulkan/serializing_command_queue.cc +++ b/iree/hal/vulkan/serializing_command_queue.cc @@ -161,11 +161,10 @@ void PrepareSubmitInfo(absl::Span wait_semaphore_handles, } // namespace SerializingCommandQueue::SerializingCommandQueue( - VkDeviceHandle* logical_device, std::string name, + VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue, TimePointFencePool* fence_pool) - : CommandQueue(logical_device, std::move(name), supported_categories, - queue), + : CommandQueue(logical_device, supported_categories, queue), fence_pool_(fence_pool) {} SerializingCommandQueue::~SerializingCommandQueue() = default; @@ -314,6 +313,8 @@ iree_status_t SerializingCommandQueue::WaitIdle(iree_time_t deadline_ns) { } iree_slim_mutex_unlock(&queue_mutex_); + + iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE); return status; } diff --git a/iree/hal/vulkan/serializing_command_queue.h b/iree/hal/vulkan/serializing_command_queue.h index 3137a1d15d47..9de2c0a8d21c 100644 --- a/iree/hal/vulkan/serializing_command_queue.h +++ b/iree/hal/vulkan/serializing_command_queue.h @@ -52,7 +52,7 @@ using SemaphoreValue = std::pair; // the GPU. class SerializingCommandQueue final : public CommandQueue { public: - SerializingCommandQueue(VkDeviceHandle* logical_device, std::string name, + SerializingCommandQueue(VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue, TimePointFencePool* fence_pool); ~SerializingCommandQueue() override; diff --git a/iree/hal/vulkan/tracing.cc b/iree/hal/vulkan/tracing.cc new file mode 100644 index 000000000000..a5de314a5d41 --- /dev/null +++ b/iree/hal/vulkan/tracing.cc @@ -0,0 +1,648 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "iree/hal/vulkan/tracing.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + +#include "iree/base/api.h" +#include "iree/base/internal/debugging.h" +#include "iree/base/target_platform.h" +#include "third_party/tracy/Tracy.hpp" +#include "third_party/tracy/client/TracyProfiler.hpp" + +// Total number of queries the per-queue query pool will contain. This +// translates to the maximum number of outstanding queries before collection is +// required. +#define IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY (64 * 1024) + +// Total number of queries that can be read back from the API in a single +// collection. +#define IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY (8 * 1024) + +// Number of times we will query the max_deviation from calibrated timestamps. +// The more we do the better confidence we have in a lower-bound. +#define IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT 32 + +typedef struct { + uint64_t timestamp; + uint64_t availability; // non-zero if available +} iree_hal_vulkan_timestamp_query_t; + +struct iree_hal_vulkan_tracing_context_s { + // Device and queue the context represents. + iree::hal::vulkan::VkDeviceHandle* logical_device; + VkQueue queue; + iree_allocator_t host_allocator; + + // Maintenance queue that supports dispatch commands and can be used to reset + // queries. + VkQueue maintenance_dispatch_queue; + // Command pool that serves command buffers compatible with the + // |maintenance_dispatch_queue|. + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool; + + // A unique GPU zone ID allocated from Tracy. + // There is a global limit of 255 GPU zones (ID 255 is special). + uint8_t id; + + // Defines how the timestamps are interpreted (device-specific, posix, QPC). + // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimeDomainEXT.html + VkTimeDomainEXT time_domain; + + // Maximum expected deviation between CPU and GPU timestamps based on an + // average computed at startup. Calibration events that exceed this value are + // discarded. + uint64_t max_expected_deviation; + + // Vulkan-reported CPU timestamp of the last calibration. + // Used to detect when drift occurs and we need to notify tracy. + uint64_t previous_cpu_time; + + // Pool of query instances that we treat as a backing store for a ringbuffer. + VkQueryPool query_pool; + + // Indices into |query_pool| defining a ringbuffer. + uint32_t query_head; + uint32_t query_tail; + uint32_t query_capacity; + + // Readback storage; large enough to get a decent chunk of queries back from + // the API in one shot. + // + // Data is stored as [[timestamp, availability], ...]. + // Availability will be non-zero if the timestamp is valid. Since we put all + // timestamps in order once we reach an unavailable timestamp we can bail + // and leave that for future collections. + iree_hal_vulkan_timestamp_query_t + readback_buffer[IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY]; +}; + +// Allocates and begins a command buffer and returns its handle. +// Returns VK_NULL_HANDLE if allocation fails. 
+static VkCommandBuffer iree_hal_vulkan_tracing_begin_command_buffer(
+    iree_hal_vulkan_tracing_context_t* context) {
+  const auto& syms = context->logical_device->syms();
+
+  VkCommandBufferAllocateInfo command_buffer_info;
+  memset(&command_buffer_info, 0, sizeof(command_buffer_info));
+  command_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  command_buffer_info.commandPool = *context->maintenance_command_pool;
+  command_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  command_buffer_info.commandBufferCount = 1;
+  VkCommandBuffer command_buffer = VK_NULL_HANDLE;
+  IREE_IGNORE_ERROR(context->maintenance_command_pool->Allocate(
+      &command_buffer_info, &command_buffer));
+  if (!command_buffer) return VK_NULL_HANDLE;
+
+  VkCommandBufferBeginInfo begin_info;
+  memset(&begin_info, 0, sizeof(begin_info));
+  begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  syms->vkBeginCommandBuffer(command_buffer, &begin_info);
+
+  return command_buffer;
+}
+
+// Ends and submits |command_buffer| and waits for it to complete.
+static void iree_hal_vulkan_tracing_submit_command_buffer(
+    iree_hal_vulkan_tracing_context_t* context,
+    VkCommandBuffer command_buffer) {
+  const auto& syms = context->logical_device->syms();
+
+  syms->vkEndCommandBuffer(command_buffer);
+
+  VkSubmitInfo submit_info;
+  memset(&submit_info, 0, sizeof(submit_info));
+  submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info.commandBufferCount = 1;
+  submit_info.pCommandBuffers = &command_buffer;
+  syms->vkQueueSubmit(context->maintenance_dispatch_queue, 1, &submit_info,
+                      VK_NULL_HANDLE);
+  syms->vkQueueWaitIdle(context->maintenance_dispatch_queue);
+
+  context->maintenance_command_pool->Free(command_buffer);
+}
+
+// Synchronously resets a range of queries in a query pool.
+// This may submit commands to the queue.
+static void iree_hal_vulkan_tracing_reset_query_pool(
+    iree_hal_vulkan_tracing_context_t* context, uint32_t query_index,
+    uint32_t query_count) {
+  const auto& syms = context->logical_device->syms();
+
+  // Fast-path for when host-side vkResetQueryPool is available.
+  // This is core in Vulkan 1.2.
+  if (context->logical_device->enabled_extensions().host_query_reset) {
+    PFN_vkResetQueryPool vkResetQueryPool_fn = syms->vkResetQueryPool
+                                                   ? syms->vkResetQueryPool
+                                                   : syms->vkResetQueryPoolEXT;
+    if (vkResetQueryPool_fn != NULL) {
+      vkResetQueryPool_fn(*context->logical_device, context->query_pool,
+                          query_index, query_count);
+      return;
+    }
+  }
+
+  // Slow-path: submit a command buffer to reset the query pool. It's obvious
+  // why vkResetQueryPool was added :)
+  VkCommandBuffer command_buffer =
+      iree_hal_vulkan_tracing_begin_command_buffer(context);
+  if (command_buffer != VK_NULL_HANDLE) {
+    syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_index,
+                              query_count);
+    iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer);
+  }
+}
+
+// Attempts to get a timestamp from both the CPU and GPU that are correlated
+// with each other. Only valid when calibration is supported.
+static void iree_hal_vulkan_tracing_query_calibration_timestamps(
+    iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time,
+    uint64_t* out_gpu_time) {
+  *out_cpu_time = 0;
+  *out_gpu_time = 0;
+
+  VkCalibratedTimestampInfoEXT timestamp_infos[2];
+  timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+  timestamp_infos[0].pNext = NULL;
+  timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
+  timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
+  timestamp_infos[1].pNext = NULL;
+  timestamp_infos[1].timeDomain = context->time_domain;
+  uint64_t timestamps[2] = {0, 0};
+  uint64_t max_deviation = 0;
+  do {
+    context->logical_device->syms()->vkGetCalibratedTimestampsEXT(
+        *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos,
+        timestamps, &max_deviation);
+  } while (max_deviation > context->max_expected_deviation);
+
+  *out_gpu_time = timestamps[0];
+  *out_cpu_time = timestamps[1];
+  switch (context->time_domain) {
+#if defined(IREE_PLATFORM_WINDOWS)
+    case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT:
+      *out_cpu_time *= (uint64_t)(1000000000.0 / tracy::GetFrequencyQpc());
+      break;
+#else
+    case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
+    case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+      // TODO(benvanik): posix calibrated timestamps - ignored for now.
+      break;
+#endif  // IREE_PLATFORM_WINDOWS
+  }
+}
+
+// Populates |out_cpu_time| and |out_gpu_time| with calibrated timestamps.
+// Depending on whether VK_EXT_calibrated_timestamps is available this may be
+// a guess done by ourselves (with lots of slop) or done by the driver (with
+// less slop).
+static void iree_hal_vulkan_tracing_perform_initial_calibration(
+    iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time,
+    uint64_t* out_gpu_time) {
+  const auto& syms = context->logical_device->syms();
+  *out_cpu_time = 0;
+  *out_gpu_time = 0;
+
+  // Attempt to get a timestamp from both the device and the host at roughly
+  // the same time. There's a gap between when we get control returned to us
+  // after submitting and waiting for idle and that will be the slop we have
+  // in the timings in the tracy UI.
+  if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) {
+    // Submit a device timestamp.
+    VkCommandBuffer command_buffer =
+        iree_hal_vulkan_tracing_begin_command_buffer(context);
+    if (command_buffer != VK_NULL_HANDLE) {
+      syms->vkCmdWriteTimestamp(command_buffer,
+                                VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                                context->query_pool, 0);
+      iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer);
+    }
+
+    // Query the timestamp from the host and the device.
+    *out_cpu_time = tracy::Profiler::GetTime();
+    syms->vkGetQueryPoolResults(
+        *context->logical_device, context->query_pool, 0, 1,
+        sizeof(*out_gpu_time), out_gpu_time, sizeof(*out_gpu_time),
+        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
+
+    // Reset the query used.
+    iree_hal_vulkan_tracing_reset_query_pool(context, 0, 1);
+    return;
+  }
+
+  // From the spec:
+  // The maximum deviation may vary between calls to
+  // vkGetCalibratedTimestampsEXT even for the same set of time domains due to
+  // implementation and platform specific reasons. It is the application’s
+  // responsibility to assess whether the returned maximum deviation makes the
+  // timestamp values suitable for any particular purpose and can choose to
+  // re-issue the timestamp calibration call pursuing a lower deviation value.
+ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkGetCalibratedTimestampsEXT.html + // + // We perform a small number of queries here and find the minimum deviation + // across all of them to get an average lower bound on the maximum deviation + // from any particular query. We then use that as our baseline (plus some + // slop) to see if calibration events in the future are reasonable. + VkCalibratedTimestampInfoEXT timestamp_infos[2]; + timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; + timestamp_infos[0].pNext = NULL; + timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT; + timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; + timestamp_infos[1].pNext = NULL; + timestamp_infos[1].timeDomain = context->time_domain; + uint64_t max_deviations[IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT]; + for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(max_deviations); ++i) { + uint64_t timestamps[2] = {0, 0}; + syms->vkGetCalibratedTimestampsEXT( + *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos, + timestamps, &max_deviations[i]); + } + uint64_t min_deviation = max_deviations[0]; + for (iree_host_size_t i = 1; i < IREE_ARRAYSIZE(max_deviations); ++i) { + min_deviation = iree_min(min_deviation, max_deviations[i]); + } + context->max_expected_deviation = min_deviation * 3 / 2; + + iree_hal_vulkan_tracing_query_calibration_timestamps( + context, &context->previous_cpu_time, out_gpu_time); + *out_cpu_time = tracy::Profiler::GetTime(); +} + +// Performs a periodic calibration (if supported) and sends the data to tracy. +// Over time the host and device clocks may drift (especially with power events) +// and by frequently performing this we ensure that the samples we are sending +// to tracy are able to be correlated. +void iree_hal_vulkan_tracing_perform_calibration( + iree_hal_vulkan_tracing_context_t* context) { + if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) return; + + uint64_t cpu_time = 0; + uint64_t gpu_time = 0; + iree_hal_vulkan_tracing_query_calibration_timestamps(context, &cpu_time, + &gpu_time); + + uint64_t tracy_time = tracy::Profiler::GetTime(); + if (cpu_time > context->previous_cpu_time) { + uint64_t cpu_delta = cpu_time - context->previous_cpu_time; + context->previous_cpu_time = cpu_time; + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuCalibration); + tracy::MemWrite(&item->gpuCalibration.gpuTime, gpu_time); + tracy::MemWrite(&item->gpuCalibration.cpuTime, tracy_time); + tracy::MemWrite(&item->gpuCalibration.cpuDelta, cpu_delta); + tracy::MemWrite(&item->gpuCalibration.context, context->id); + tracy::Profiler::QueueSerialFinish(); + } +} + +// Prepares the VkQueryPool backing storage for our query ringbuffer. +static void iree_hal_vulkan_tracing_prepare_query_pool( + iree_hal_vulkan_tracing_context_t* context) { + // Create a query pool with the largest query capacity it can provide. 
+ VkQueryPoolCreateInfo pool_info; + memset(&pool_info, 0, sizeof(pool_info)); + pool_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + pool_info.queryCount = IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY; + pool_info.queryType = VK_QUERY_TYPE_TIMESTAMP; + while (context->logical_device->syms()->vkCreateQueryPool( + *context->logical_device, &pool_info, + context->logical_device->allocator(), + &context->query_pool) != VK_SUCCESS) { + pool_info.queryCount /= 2; + } + context->query_capacity = pool_info.queryCount; + + // Perform initial reset of the query pool. All queries must be reset upon + // creation before first use. + iree_hal_vulkan_tracing_reset_query_pool(context, 0, context->query_capacity); +} + +// Prepares the Tracy-related GPU context that events are fed into. Each context +// will appear as a unique plot in the tracy UI with the given |queue_name|. +static void iree_hal_vulkan_tracing_prepare_gpu_context( + iree_hal_vulkan_tracing_context_t* context, + VkPhysicalDevice physical_device, iree_string_view_t queue_name) { + // Allocate the process-unique GPU context ID. There's a max of 255 available; + // if we are recreating devices a lot we may exceed that. Don't do that, or + // wrap around and get weird (but probably still usable) numbers. + context->id = + tracy::GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed); + if (context->id >= 255) { + context->id %= 255; + } + + // The number of nanoseconds required for a timestamp query to be incremented + // by 1. + VkPhysicalDeviceProperties device_properties; + context->logical_device->syms()->vkGetPhysicalDeviceProperties( + physical_device, &device_properties); + float timestamp_period = device_properties.limits.timestampPeriod; + + // Perform initial calibration for tracy to be able to correlate timestamps + // between CPU and GPU. + uint64_t cpu_time = 0; + uint64_t gpu_time = 0; + iree_hal_vulkan_tracing_perform_initial_calibration(context, &cpu_time, + &gpu_time); + + uint8_t context_flags = 0; + if (context->time_domain != VK_TIME_DOMAIN_DEVICE_EXT) { + // Tell tracy we'll be passing calibrated timestamps and not to mess with + // the times. We'll periodically send GpuCalibration events in case the + // times drift. + context_flags |= tracy::GpuContextCalibration; + } + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuNewContext); + tracy::MemWrite(&item->gpuNewContext.cpuTime, cpu_time); + tracy::MemWrite(&item->gpuNewContext.gpuTime, gpu_time); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + tracy::MemWrite(&item->gpuNewContext.period, timestamp_period); + tracy::MemWrite(&item->gpuNewContext.context, context->id); + tracy::MemWrite(&item->gpuNewContext.flags, context_flags); + tracy::MemWrite(&item->gpuNewContext.type, tracy::GpuContextType::Vulkan); + tracy::Profiler::QueueSerialFinish(); + } + + // Send the name of the context along. + // NOTE: we intentionally leak the name here as tracy needs a pointer that + // survives until process exit (in case TRACY_NO_EXIT is set and the app waits + // in exit() for the profiler to attach). 
+ IREE_LEAK_CHECK_DISABLE_PUSH(); + char* cloned_name = (char*)malloc(queue_name.size); + memcpy(cloned_name, queue_name.data, queue_name.size); + IREE_LEAK_CHECK_DISABLE_POP(); + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuContextName); + tracy::MemWrite(&item->gpuContextNameFat.context, context->id); + tracy::MemWrite(&item->gpuContextNameFat.ptr, (uint64_t)cloned_name); + tracy::MemWrite(&item->gpuContextNameFat.size, queue_name.size); + tracy::Profiler::QueueSerialFinish(); + } +} + +// Returns the best possible platform-supported time domain, falling back to +// VK_TIME_DOMAIN_DEVICE_EXT. By default it is one that is only usable for +// device-relative calculations and that we need to perform our own hacky +// calibration on. +static VkTimeDomainEXT iree_hal_vulkan_tracing_query_time_domain( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device) { + if (!logical_device->enabled_extensions().calibrated_timestamps) { + // Calibrated timestamps extension is not available; we'll only have the + // device domain. + return VK_TIME_DOMAIN_DEVICE_EXT; + } + + uint32_t time_domain_count = 0; + if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( + physical_device, &time_domain_count, NULL) != VK_SUCCESS) { + return VK_TIME_DOMAIN_DEVICE_EXT; + } + VkTimeDomainEXT* time_domains = (VkTimeDomainEXT*)iree_alloca( + time_domain_count * sizeof(VkTimeDomainEXT)); + if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( + physical_device, &time_domain_count, time_domains) != VK_SUCCESS) { + return VK_TIME_DOMAIN_DEVICE_EXT; + } + + for (uint32_t i = 0; i < time_domain_count; i++) { + switch (time_domains[i]) { +#if defined(IREE_PLATFORM_WINDOWS) + case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT: + return time_domains[i]; +#else + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: + // TODO(benvanik): support posix clock domains with some kind of math. + // return time_domains[i]; -- ignored +#endif // IREE_PLATFORM_WINDOWS + default: + continue; + } + } + return VK_TIME_DOMAIN_DEVICE_EXT; +} + +iree_status_t iree_hal_vulkan_tracing_context_allocate( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, + iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue, + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool, + iree_allocator_t host_allocator, + iree_hal_vulkan_tracing_context_t** out_context) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_ASSERT_ARGUMENT(logical_device); + IREE_ASSERT_ARGUMENT(out_context); + *out_context = NULL; + + iree_hal_vulkan_tracing_context_t* context = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context); + if (iree_status_is_ok(status)) { + context->logical_device = logical_device; + context->queue = queue; + context->host_allocator = host_allocator; + context->time_domain = iree_hal_vulkan_tracing_query_time_domain( + physical_device, logical_device); + context->maintenance_dispatch_queue = maintenance_dispatch_queue; + context->maintenance_command_pool = maintenance_command_pool; + + // Prepare the query pool and perform the initial calibration. + iree_hal_vulkan_tracing_prepare_query_pool(context); + + // Prepare the Tracy GPU context. 
+ iree_hal_vulkan_tracing_prepare_gpu_context(context, physical_device, + queue_name); + } + + if (iree_status_is_ok(status)) { + *out_context = context; + } else { + iree_hal_vulkan_tracing_context_free(context); + } + IREE_TRACE_ZONE_END(z0); + return status; +} + +void iree_hal_vulkan_tracing_context_free( + iree_hal_vulkan_tracing_context_t* context) { + if (!context) return; + IREE_TRACE_ZONE_BEGIN(z0); + + if (context->query_pool != VK_NULL_HANDLE) { + // Always perform a collection on shutdown. + iree_hal_vulkan_tracing_context_collect(context, VK_NULL_HANDLE); + + auto* logical_device = context->logical_device; + logical_device->syms()->vkDestroyQueryPool( + *logical_device, context->query_pool, logical_device->allocator()); + } + + iree_allocator_t host_allocator = context->host_allocator; + iree_allocator_free(host_allocator, context); + + IREE_TRACE_ZONE_END(z0); +} + +uint32_t iree_hal_vulkan_tracing_context_acquire_query_id( + iree_hal_vulkan_tracing_context_t* context) { + uint32_t id = context->query_head; + context->query_head = (context->query_head + 1) % context->query_capacity; + assert(context->query_head != context->query_tail); + return id; +} + +void iree_hal_vulkan_tracing_context_collect( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) { + if (!context) return; + if (context->query_tail == context->query_head) { + // No outstanding queries. + return; + } + IREE_TRACE_ZONE_BEGIN(z0); + const auto& syms = context->logical_device->syms(); + + while (context->query_tail != context->query_head) { + // Compute the contiguous range of queries ready to be read. + // If the ringbuffer wraps around we'll handle that in the next loop. + uint32_t try_query_count = + context->query_head < context->query_tail + ? context->query_capacity - context->query_tail + : context->query_head - context->query_tail; + try_query_count = iree_min(try_query_count, + IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY); + + // Read back all of the queries. Note that we also are reading back the + // availability such that we can handle partial readiness of the outstanding + // range of queries. + uint32_t query_base = context->query_tail; + if (syms->vkGetQueryPoolResults( + *context->logical_device, context->query_pool, query_base, + try_query_count, sizeof(context->readback_buffer), + context->readback_buffer, sizeof(iree_hal_vulkan_timestamp_query_t), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) != + VK_SUCCESS) { + break; + } + + // Scan and feed the times to tracy, stopping when we hit the first + // unavailable query. + uint32_t read_query_count = 0; + for (uint32_t i = 0; i < try_query_count; ++i) { + if (context->readback_buffer[i].availability == 0) break; + read_query_count = i + 1; + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuTime); + tracy::MemWrite(&item->gpuTime.gpuTime, + context->readback_buffer[i].timestamp); + tracy::MemWrite(&item->gpuTime.queryId, (uint16_t)(query_base + i)); + tracy::MemWrite(&item->gpuTime.context, context->id); + tracy::Profiler::QueueSerialFinish(); + } + + // Reset the range of queries read back. 
+ if (command_buffer != VK_NULL_HANDLE) { + syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_base, + read_query_count); + } else { + iree_hal_vulkan_tracing_reset_query_pool(context, query_base, + read_query_count); + } + + context->query_tail += read_query_count; + if (context->query_tail >= context->query_capacity) { + context->query_tail = 0; + } + } + + // Run calibration - we could do this less frequently in cases where collect + // is called every submission, however it's relatively cheap compared to all + // this other tracing overhead. + iree_hal_vulkan_tracing_perform_calibration(context); + + IREE_TRACE_ZONE_END(z0); +} + +void iree_hal_vulkan_tracing_zone_begin_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, + const iree_tracing_location_t* src_loc) { + if (!context) return; + + uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); + context->logical_device->syms()->vkCmdWriteTimestamp( + command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, + query_id); + + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneBeginSerial); + tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime()); + tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc); + tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle()); + tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id); + tracy::MemWrite(&item->gpuZoneBegin.context, context->id); + tracy::Profiler::QueueSerialFinish(); +} + +void iree_hal_vulkan_tracing_zone_begin_external_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, + const char* file_name, size_t file_name_length, uint32_t line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length) { + if (!context) return; + + uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); + context->logical_device->syms()->vkCmdWriteTimestamp( + command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, + query_id); + + const auto src_loc = tracy::Profiler::AllocSourceLocation( + line, file_name, file_name_length, function_name, function_name_length, + name, name_length); + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, + tracy::QueueType::GpuZoneBeginAllocSrcLocSerial); + tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime()); + tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc); + tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle()); + tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id); + tracy::MemWrite(&item->gpuZoneBegin.context, context->id); + tracy::Profiler::QueueSerialFinish(); +} + +void iree_hal_vulkan_tracing_zone_end_impl( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) { + if (!context) return; + + uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); + context->logical_device->syms()->vkCmdWriteTimestamp( + command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, + query_id); + + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneEndSerial); + tracy::MemWrite(&item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime()); + tracy::MemWrite(&item->gpuZoneEnd.thread, tracy::GetThreadHandle()); + tracy::MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)query_id); + 
  tracy::MemWrite(&item->gpuZoneEnd.context, context->id);
+  tracy::Profiler::QueueSerialFinish();
+}
+
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
diff --git a/iree/hal/vulkan/tracing.h b/iree/hal/vulkan/tracing.h
new file mode 100644
index 000000000000..f637b702b1a3
--- /dev/null
+++ b/iree/hal/vulkan/tracing.h
@@ -0,0 +1,179 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_TRACING_H_
+#define IREE_HAL_VULKAN_TRACING_H_
+
+// clang-format off: Must be included before all other headers:
+#include "iree/hal/vulkan/vulkan_headers.h"
+// clang-format on
+
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Per-queue Vulkan tracing context.
+// No-op if IREE tracing is not enabled.
+//
+// Use the IREE_VULKAN_TRACE_* macros to trace a contiguous set of command
+// buffer operations. Unlike the normal tracy macros there are no zone IDs and
+// instead each queue gets an ID allocated once and passed to all tracing
+// macros.
+//
+// Usage:
+//   IREE_VULKAN_TRACE_ZONE_BEGIN(queue->tracing_context, command_buffer);
+//   vkCmdDispatch(command_buffer, ...);
+//   IREE_VULKAN_TRACE_ZONE_END(queue->tracing_context, command_buffer);
+//   ...
+//   iree_hal_vulkan_tracing_context_collect(queue->tracing_context,
+//                                           command_buffer);
+//   vkQueueSubmit(...command_buffer...);
+//
+// NOTE: timestamps have non-trivial side-effecting behavior on the device:
+// inserting a timestamp is in the worst (and average) case just as bad as
+// inserting a full global execution barrier. If two command buffer operations
+// that could overlap (no barrier between them) have tracing zones placed
+// around them they will execute sequentially.
+//
+// TODO(benvanik):
+// Each queue needs a context and maintains its own query pool. In the future
+// this should be changed to have a single query pool per device to reduce
+// bookkeeping overhead.
+//
+// TODO(benvanik):
+// Both a zone begin and zone end always insert timestamps leading to N*2
+// total queries, however within command buffers the end of one zone and the
+// begin of another share the same point in time. By inserting the timestamps
+// at barriers in the command buffer the query count can be reduced to N+1.
+//
+// TODO(benvanik):
+// vkCmdCopyQueryPoolResults is really what we should be using to do this -
+// that inserts a device-side transfer to a buffer (conceptually) that is
+// in-stream with all submissions to a queue. This changes things to a push
+// model vs. the pull one in _collect and allows us to pipeline the readbacks.
+// Instead of being limited to the query pool slots we'd only be limited by
+// the size of the buffer the copy targets allowing us to perform collection
+// much more infrequently.
+//
+// Thread-compatible: external synchronization is required if using from
+// multiple threads (same as with VkQueue itself).
+typedef struct iree_hal_vulkan_tracing_context_s
+    iree_hal_vulkan_tracing_context_t;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Allocates a tracing context for the given Vulkan queue.
+// Each context must only be used with the queue it was created with.
+//
+// |maintenance_dispatch_queue| may be used to perform query pool maintenance
+// tasks and must support graphics or compute commands.
+iree_status_t iree_hal_vulkan_tracing_context_allocate(
+    VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue,
+    iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue,
+    iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool,
+    iree_allocator_t host_allocator,
+    iree_hal_vulkan_tracing_context_t** out_context);
+
+// Frees a tracing context and all associated Vulkan resources.
+// All submissions using the resources must be completed prior to calling.
+void iree_hal_vulkan_tracing_context_free(
+    iree_hal_vulkan_tracing_context_t* context);
+
+// Collects in-flight timestamp queries from the queue and feeds them to tracy.
+// Must be called frequently (every submission, etc.) to drain the backlog;
+// tracing may start failing if the internal ringbuffer is exceeded.
+//
+// The provided |command_buffer| may receive additional bookkeeping commands
+// that should have no impact on correctness or behavior. If VK_NULL_HANDLE is
+// provided then collection will occur synchronously.
+void iree_hal_vulkan_tracing_context_collect(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer);
+
+// Begins a normal zone derived from the calling |src_loc|.
+// Must be perfectly nested and paired with a corresponding zone end.
+void iree_hal_vulkan_tracing_zone_begin_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
+    const iree_tracing_location_t* src_loc);
+
+// Begins an external zone using the given source information.
+// The provided strings will be copied into the tracy buffer.
+void iree_hal_vulkan_tracing_zone_begin_external_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer,
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length);
+
+// Ends the zone most recently begun with one of the zone begin functions.
+void iree_hal_vulkan_tracing_zone_end_impl(
+    iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer);
+
+// Begins a new zone with the parent function name.
+#define IREE_VULKAN_TRACE_ZONE_BEGIN(context, command_buffer)                \
+  static const iree_tracing_location_t TracyConcat(                          \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__,    \
+                                            (uint32_t)__LINE__, 0};          \
+  iree_hal_vulkan_tracing_zone_begin_impl(                                   \
+      context, command_buffer,                                               \
+      &TracyConcat(__tracy_source_location, __LINE__));
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL(                                \
+    context, command_buffer, file_name, file_name_length, line,              \
+    function_name, function_name_length, name, name_length)                  \
+  iree_hal_vulkan_tracing_zone_begin_external_impl(                          \
+      context, command_buffer, file_name, file_name_length, line,            \
+      function_name, function_name_length, name, name_length)
+
+// Ends the current zone. Must be passed the same |context| and
+// |command_buffer| as the corresponding _BEGIN.
+#define IREE_VULKAN_TRACE_ZONE_END(context, command_buffer) \ + iree_hal_vulkan_tracing_zone_end_impl(context, command_buffer) + +#else + +inline iree_status_t iree_hal_vulkan_tracing_context_allocate( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, + iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue, + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool, + iree_allocator_t host_allocator, + iree_hal_vulkan_tracing_context_t** out_context) { + *out_context = NULL; + return iree_ok_status(); +} + +inline void iree_hal_vulkan_tracing_context_free( + iree_hal_vulkan_tracing_context_t* context) {} + +inline void iree_hal_vulkan_tracing_context_collect( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) {} + +#define IREE_VULKAN_TRACE_ZONE_BEGIN(context, command_buffer) +#define IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, command_buffer, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) +#define IREE_VULKAN_TRACE_ZONE_END(context, command_buffer) + +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_VULKAN_TRACING_H_ diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc index 32f14b404321..9cf8b3ac9b6f 100644 --- a/iree/hal/vulkan/vulkan_device.cc +++ b/iree/hal/vulkan/vulkan_device.cc @@ -39,6 +39,7 @@ #include "iree/hal/vulkan/nop_executable_cache.h" #include "iree/hal/vulkan/serializing_command_queue.h" #include "iree/hal/vulkan/status_util.h" +#include "iree/hal/vulkan/tracing.h" #include "iree/hal/vulkan/vma_allocator.h" using namespace iree::hal::vulkan; @@ -135,6 +136,26 @@ iree_hal_vulkan_query_extensibility_set( VK_EXT_DEBUG_UTILS_EXTENSION_NAME); } +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + if (iree_all_bits_set(requested_features, + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) { + // VK_EXT_host_query_reset: + // optionally allows for vkResetQueryPool to be used to reset query pools + // from the host without needing to do an expensive vkCmdResetQueryPool + // submission. + ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL, + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME); + + // VK_EXT_calibrated_timestamps: + // optionally provides more accurate timestamps that correspond to the + // system time. If this is not present then tracy will attempt calibration + // itself and have some per-run variance in the skew (up to many + // milliseconds). + ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL, + VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME); + } +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + *out_string_count = string_count; return status; } @@ -338,6 +359,9 @@ typedef struct { iree_host_size_t transfer_queue_count; CommandQueue** transfer_queues; + // |queue_count| tracing contexts, if tracing is enabled. 
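+  // Indexed in parallel with |queues| so each context can be freed alongside
+  // the queue it was created for.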
+ iree_hal_vulkan_tracing_context_t** queue_tracing_contexts; + DescriptorPoolCache* descriptor_pool_cache; VkCommandPoolHandle* dispatch_command_pool; @@ -395,65 +419,110 @@ static CommandQueue* iree_hal_vulkan_device_create_queue( VkQueue queue = VK_NULL_HANDLE; logical_device->syms()->vkGetDeviceQueue(*logical_device, queue_family_index, queue_index, &queue); - std::string queue_name; - if (!iree_all_bits_set(command_category, - IREE_HAL_COMMAND_CATEGORY_DISPATCH)) { - queue_name = "q(t):"; - } else { - queue_name = "q(d):"; - } - queue_name += std::to_string(queue_index); // When emulating timeline semaphores we use a special queue that allows us to // sequence the semaphores correctly. if (fence_pool != NULL) { - return new SerializingCommandQueue(logical_device, std::move(queue_name), - command_category, queue, fence_pool); + return new SerializingCommandQueue(logical_device, command_category, queue, + fence_pool); } - return new DirectCommandQueue(logical_device, std::move(queue_name), - command_category, queue); + return new DirectCommandQueue(logical_device, command_category, queue); } // Creates command queues for the given sets of queues and populates the // device queue lists. -static void iree_hal_vulkan_device_initialize_command_queues( - iree_hal_vulkan_device_t* device, iree_string_view_t queue_prefix, +static iree_status_t iree_hal_vulkan_device_initialize_command_queues( + iree_hal_vulkan_device_t* device, + iree_hal_vulkan_features_t enabled_features, + iree_string_view_t queue_prefix, const iree_hal_vulkan_queue_set_t* compute_queue_set, const iree_hal_vulkan_queue_set_t* transfer_queue_set) { device->queue_count = 0; device->dispatch_queue_count = 0; device->transfer_queue_count = 0; + // The first available queue supporting dispatch commands that will be used by + // the tracing subsystem for query and cleanup tasks. + VkQueue maintenance_dispatch_queue = VK_NULL_HANDLE; + uint64_t compute_queue_count = iree_math_count_ones_u64(compute_queue_set->queue_indices); uint64_t transfer_queue_count = iree_math_count_ones_u64(transfer_queue_set->queue_indices); for (iree_host_size_t i = 0; i < compute_queue_count; ++i) { if (!(compute_queue_set->queue_indices & (1ull << i))) continue; + + char queue_name_buffer[32]; + int queue_name_length = + snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer), + "Vulkan[%c:%d]", 'D', (int)device->dispatch_queue_count); + iree_string_view_t queue_name = + iree_make_string_view(queue_name_buffer, queue_name_length); + CommandQueue* queue = iree_hal_vulkan_device_create_queue( device->logical_device, IREE_HAL_COMMAND_CATEGORY_ANY, compute_queue_set->queue_family_index, i, device->fence_pool); - device->queues[device->queue_count++] = queue; + + iree_host_size_t queue_index = device->queue_count++; + device->queues[queue_index] = queue; device->dispatch_queues[device->dispatch_queue_count++] = queue; + if (!transfer_queue_count) { // If we don't have any dedicated transfer queues then use all dispatch // queues as transfer queues. 
device->transfer_queues[device->transfer_queue_count++] = queue; } + + if (maintenance_dispatch_queue == VK_NULL_HANDLE) { + maintenance_dispatch_queue = queue->handle(); + } + + if (iree_all_bits_set(enabled_features, + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) { + IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate( + device->physical_device, device->logical_device, queue->handle(), + queue_name, maintenance_dispatch_queue, device->dispatch_command_pool, + device->host_allocator, + &device->queue_tracing_contexts[queue_index])); + queue->set_tracing_context(device->queue_tracing_contexts[queue_index]); + } } for (iree_host_size_t i = 0; i < transfer_queue_count; ++i) { if (!(transfer_queue_set->queue_indices & (1ull << i))) continue; + + char queue_name_buffer[32]; + int queue_name_length = + snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer), + "Vulkan[%c:%d]", 'T', (int)device->transfer_queue_count); + iree_string_view_t queue_name = + iree_make_string_view(queue_name_buffer, queue_name_length); + CommandQueue* queue = iree_hal_vulkan_device_create_queue( device->logical_device, IREE_HAL_COMMAND_CATEGORY_TRANSFER, transfer_queue_set->queue_family_index, i, device->fence_pool); - device->queues[device->queue_count++] = queue; + + iree_host_size_t queue_index = device->queue_count++; + device->queues[queue_index] = queue; device->transfer_queues[device->transfer_queue_count++] = queue; + + if (iree_all_bits_set(enabled_features, + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) { + IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate( + device->physical_device, device->logical_device, queue->handle(), + queue_name, maintenance_dispatch_queue, device->dispatch_command_pool, + device->host_allocator, + &device->queue_tracing_contexts[queue_index])); + queue->set_tracing_context(device->queue_tracing_contexts[queue_index]); + } } + + return iree_ok_status(); } static iree_status_t iree_hal_vulkan_device_create_internal( iree_hal_driver_t* driver, iree_string_view_t identifier, + iree_hal_vulkan_features_t enabled_features, const iree_hal_vulkan_device_options_t* options, VkInstance instance, VkPhysicalDevice physical_device, VkDeviceHandle* logical_device, const iree_hal_vulkan_device_extensions_t* device_extensions, @@ -474,7 +543,8 @@ static iree_status_t iree_hal_vulkan_device_create_internal( sizeof(*device) + identifier.size + total_queue_count * sizeof(device->queues[0]) + total_queue_count * sizeof(device->dispatch_queues[0]) + - total_queue_count * sizeof(device->transfer_queues[0]); + total_queue_count * sizeof(device->transfer_queues[0]) + + total_queue_count * sizeof(device->queue_tracing_contexts[0]); IREE_RETURN_IF_ERROR( iree_allocator_malloc(host_allocator, total_size, (void**)&device)); memset(device, 0, total_size); @@ -502,6 +572,9 @@ static iree_status_t iree_hal_vulkan_device_create_internal( buffer_ptr += total_queue_count * sizeof(device->dispatch_queues[0]); device->transfer_queues = (CommandQueue**)buffer_ptr; buffer_ptr += total_queue_count * sizeof(device->transfer_queues[0]); + device->queue_tracing_contexts = + (iree_hal_vulkan_tracing_context_t**)buffer_ptr; + buffer_ptr += total_queue_count * sizeof(device->queue_tracing_contexts[0]); device->descriptor_pool_cache = new DescriptorPoolCache(device->logical_device); @@ -550,8 +623,9 @@ static iree_status_t iree_hal_vulkan_device_create_internal( // initialization; this happens last as the queues require the pools allocated // above. 
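+  // The per-queue tracing contexts allocated during queue initialization also
+  // rely on |dispatch_command_pool| for their query pool maintenance work.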
if (iree_status_is_ok(status)) { - iree_hal_vulkan_device_initialize_command_queues( - device, identifier, compute_queue_set, transfer_queue_set); + status = iree_hal_vulkan_device_initialize_command_queues( + device, enabled_features, identifier, compute_queue_set, + transfer_queue_set); } if (iree_status_is_ok(status)) { @@ -570,6 +644,7 @@ static void iree_hal_vulkan_device_destroy(iree_hal_device_t* base_device) { // Drop all command queues. These may wait until idle in their destructor. for (iree_host_size_t i = 0; i < device->queue_count; ++i) { delete device->queues[i]; + iree_hal_vulkan_tracing_context_free(device->queue_tracing_contexts[i]); } // Drop command pools now that we know there are no more outstanding command @@ -715,6 +790,16 @@ iree_status_t iree_hal_vulkan_device_create( semaphore_features.timelineSemaphore = VK_TRUE; } + VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset_features; + if (enabled_device_extensions.host_query_reset) { + memset(&host_query_reset_features, 0, sizeof(host_query_reset_features)); + host_query_reset_features.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT; + host_query_reset_features.pNext = features2.pNext; + features2.pNext = &host_query_reset_features; + host_query_reset_features.hostQueryReset = VK_TRUE; + } + auto logical_device = new VkDeviceHandle( instance_syms, enabled_device_extensions, /*owns_device=*/true, host_allocator, /*allocator=*/NULL); @@ -741,9 +826,9 @@ iree_status_t iree_hal_vulkan_device_create( // Allocate and initialize the device. if (iree_status_is_ok(status)) { status = iree_hal_vulkan_device_create_internal( - driver, identifier, options, instance, physical_device, logical_device, - &enabled_device_extensions, &compute_queue_set, &transfer_queue_set, - host_allocator, out_device); + driver, identifier, enabled_features, options, instance, + physical_device, logical_device, &enabled_device_extensions, + &compute_queue_set, &transfer_queue_set, host_allocator, out_device); } logical_device->ReleaseReference(); @@ -783,6 +868,11 @@ IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_vulkan_wrap_device( iree_hal_vulkan_device_extensions_t enabled_device_extensions = iree_hal_vulkan_infer_enabled_device_extensions(device_syms.get()); + iree_hal_vulkan_features_t enabled_features = 0; +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING; +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + // Wrap the provided VkDevice with a VkDeviceHandle for use within the HAL. auto logical_device_handle = new VkDeviceHandle( device_syms.get(), enabled_device_extensions, @@ -791,9 +881,9 @@ IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_vulkan_wrap_device( // Allocate and initialize the device. 
iree_status_t status = iree_hal_vulkan_device_create_internal( - /*driver=*/NULL, identifier, options, instance, physical_device, - logical_device_handle, &enabled_device_extensions, compute_queue_set, - transfer_queue_set, host_allocator, out_device); + /*driver=*/NULL, identifier, enabled_features, options, instance, + physical_device, logical_device_handle, &enabled_device_extensions, + compute_queue_set, transfer_queue_set, host_allocator, out_device); logical_device_handle->ReleaseReference(); return status; @@ -851,9 +941,18 @@ static iree_status_t iree_hal_vulkan_device_create_command_buffer( command_pool = device->dispatch_command_pool; } + // The tracing context is tied to a particular queue so we must select here + // even though ideally we'd do it during submission. This is informational + // only and if the user does provide a different queue affinity during + // submission it just means the commands will be attributed to the wrong + // queue. + CommandQueue* queue = iree_hal_vulkan_device_select_queue( + device, command_categories, queue_affinity); + return iree_hal_vulkan_direct_command_buffer_allocate( device->logical_device, command_pool, mode, command_categories, - queue_affinity, device->descriptor_pool_cache, out_command_buffer); + queue_affinity, queue->tracing_context(), device->descriptor_pool_cache, + out_command_buffer); } static iree_status_t iree_hal_vulkan_device_create_descriptor_set( @@ -956,13 +1055,6 @@ static iree_status_t iree_hal_vulkan_device_wait_semaphores_with_timeout( static iree_status_t iree_hal_vulkan_device_wait_idle_with_deadline( iree_hal_device_t* base_device, iree_time_t deadline_ns) { iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device); - if (deadline_ns == IREE_TIME_INFINITE_FUTURE) { - // Fast path for using vkDeviceWaitIdle, which is usually cheaper (as it - // requires fewer calls into the driver). - return VK_RESULT_TO_STATUS(device->logical_device->syms()->vkDeviceWaitIdle( - *device->logical_device), - "vkDeviceWaitIdle"); - } for (iree_host_size_t i = 0; i < device->queue_count; ++i) { IREE_RETURN_IF_ERROR(device->queues[i]->WaitIdle(deadline_ns)); }
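For readers unfamiliar with the underlying mechanism, the sketch below shows the raw Vulkan timestamp-query roundtrip that the per-queue tracing context builds on: reset a small pool, bracket a dispatch with vkCmdWriteTimestamp, then read the ticks back and scale by the device's timestampPeriod. This is plain Vulkan rather than IREE API; the helper names, the 64x1x1 dispatch size, and the assumption that the recorded command buffer is submitted and waited on before readback are illustrative only.

// Sketch only: raw Vulkan timestamp queries around a single dispatch.
// Assumes the queue family supports timestamps and that the command buffer
// is submitted and completed before ReadDispatchDurationNs() is called.
#include <vulkan/vulkan.h>

static VkQueryPool CreateTimestampPool(VkDevice device) {
  VkQueryPoolCreateInfo pool_info = {};
  pool_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
  pool_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
  pool_info.queryCount = 2;  // one begin slot + one end slot
  VkQueryPool pool = VK_NULL_HANDLE;
  vkCreateQueryPool(device, &pool_info, /*pAllocator=*/NULL, &pool);
  return pool;
}

static void RecordTimedDispatch(VkCommandBuffer command_buffer,
                                VkQueryPool pool) {
  // Queries must be reset before use; with VK_EXT_host_query_reset this could
  // instead be done from the host via vkResetQueryPoolEXT.
  vkCmdResetQueryPool(command_buffer, pool, /*firstQuery=*/0, /*queryCount=*/2);
  vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                      pool, /*query=*/0);
  vkCmdDispatch(command_buffer, 64, 1, 1);
  vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                      pool, /*query=*/1);
}

static double ReadDispatchDurationNs(VkDevice device,
                                     VkPhysicalDevice physical_device,
                                     VkQueryPool pool) {
  uint64_t ticks[2] = {0, 0};
  vkGetQueryPoolResults(device, pool, /*firstQuery=*/0, /*queryCount=*/2,
                        sizeof(ticks), ticks, /*stride=*/sizeof(uint64_t),
                        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
  VkPhysicalDeviceProperties props;
  vkGetPhysicalDeviceProperties(physical_device, &props);
  // timestampPeriod is the number of nanoseconds per timestamp tick.
  return (double)(ticks[1] - ticks[0]) * props.limits.timestampPeriod;
}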