diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD index 5a21c011a647..f76bd3f9e1bf 100644 --- a/iree/hal/vulkan/BUILD +++ b/iree/hal/vulkan/BUILD @@ -72,6 +72,8 @@ cc_library( "status_util.h", "timepoint_util.cc", "timepoint_util.h", + "tracing.cc", + "tracing.h", "vma_allocator.cc", "vma_allocator.h", "vma_buffer.cc", diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt index ee165a9f7589..7a567dd9cb5f 100644 --- a/iree/hal/vulkan/CMakeLists.txt +++ b/iree/hal/vulkan/CMakeLists.txt @@ -61,6 +61,8 @@ iree_cc_library( "status_util.h" "timepoint_util.cc" "timepoint_util.h" + "tracing.cc" + "tracing.h" "vma_allocator.cc" "vma_allocator.h" "vma_buffer.cc" diff --git a/iree/hal/vulkan/api.h b/iree/hal/vulkan/api.h index 3f5912ae295e..35b1c2c8ad08 100644 --- a/iree/hal/vulkan/api.h +++ b/iree/hal/vulkan/api.h @@ -37,10 +37,20 @@ extern "C" { enum iree_hal_vulkan_feature_e { // Use VK_LAYER_KHRONOS_standard_validation to validate Vulkan API usage. // Has a significant performance penalty and is *not* a security mechanism. - IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS = 1 << 0, + IREE_HAL_VULKAN_FEATURE_ENABLE_VALIDATION_LAYERS = 1u << 0, // Use VK_EXT_debug_utils, record markers, and log errors. - IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS = 1 << 1, + IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS = 1u << 1, + + // Enables tracing of command buffers when IREE tracing is enabled. + // May take advantage of additional extensions for more accurate timing or + // hardware-specific performance counters. + // + // NOTE: tracing has a non-trivial overhead and will skew the timing of + // submissions and introduce false barriers between dispatches. Use this to + // identify slow dispatches and refine from there; be wary of whole-program + // tracing with this enabled. + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING = 1u << 2, }; typedef uint64_t iree_hal_vulkan_features_t; diff --git a/iree/hal/vulkan/command_queue.h b/iree/hal/vulkan/command_queue.h index 69486b838d82..ac0fff307580 100644 --- a/iree/hal/vulkan/command_queue.h +++ b/iree/hal/vulkan/command_queue.h @@ -22,6 +22,7 @@ #include "iree/hal/api.h" #include "iree/hal/vulkan/dynamic_symbols.h" #include "iree/hal/vulkan/handle_util.h" +#include "iree/hal/vulkan/tracing.h" #include "iree/hal/vulkan/util/arena.h" namespace iree { @@ -42,6 +43,15 @@ class CommandQueue { return logical_device_->syms(); } + VkQueue handle() const { return queue_; } + + iree_hal_vulkan_tracing_context_t* tracing_context() { + return tracing_context_; + } + void set_tracing_context(iree_hal_vulkan_tracing_context_t* tracing_context) { + tracing_context_ = tracing_context; + } + bool can_dispatch() const { return iree_all_bits_set(supported_categories_, IREE_HAL_COMMAND_CATEGORY_DISPATCH); @@ -52,19 +62,19 @@ class CommandQueue { virtual iree_status_t WaitIdle(iree_time_t deadline_ns) = 0; protected: - CommandQueue(VkDeviceHandle* logical_device, std::string name, + CommandQueue(VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue) : logical_device_(logical_device), - name_(std::move(name)), supported_categories_(supported_categories), queue_(queue) { iree_slim_mutex_initialize(&queue_mutex_); } VkDeviceHandle* logical_device_; - const std::string name_; const iree_hal_command_category_t supported_categories_; + iree_hal_vulkan_tracing_context_t* tracing_context_ = nullptr; + // VkQueue needs to be externally synchronized. 
iree_slim_mutex_t queue_mutex_; VkQueue queue_ IREE_GUARDED_BY(queue_mutex_); diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc index ca16c8a1c53b..610eed040b80 100644 --- a/iree/hal/vulkan/direct_command_buffer.cc +++ b/iree/hal/vulkan/direct_command_buffer.cc @@ -36,6 +36,7 @@ typedef struct { iree_hal_command_buffer_mode_t mode; iree_hal_command_category_t allowed_categories; iree_hal_queue_affinity_t queue_affinity; + iree_hal_vulkan_tracing_context_t* tracing_context; VkCommandPoolHandle* command_pool; VkCommandBuffer handle; @@ -68,6 +69,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate( iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, + iree_hal_vulkan_tracing_context_t* tracing_context, iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache, iree_hal_command_buffer_t** out_command_buffer) { IREE_ASSERT_ARGUMENT(logical_device); @@ -98,6 +100,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate( command_buffer->mode = mode; command_buffer->allowed_categories = command_categories; command_buffer->queue_affinity = queue_affinity; + command_buffer->tracing_context = tracing_context; command_buffer->command_pool = command_pool; command_buffer->handle = handle; command_buffer->syms = logical_device->syms().get(); @@ -564,6 +567,15 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch( iree_hal_vulkan_direct_command_buffer_t* command_buffer = iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer); + iree_hal_vulkan_source_location_t source_location; + iree_hal_vulkan_native_executable_entry_point_source_location( + executable, entry_point, &source_location); + IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( + command_buffer->tracing_context, command_buffer->handle, + source_location.file_name.data, source_location.file_name.size, + source_location.line, source_location.func_name.data, + source_location.func_name.size, NULL, 0); + // Get the compiled and linked pipeline for the specified entry point and // bind it to the command buffer. VkPipeline pipeline_handle = VK_NULL_HANDLE; @@ -576,6 +588,9 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch( command_buffer->syms->vkCmdDispatch(command_buffer->handle, workgroup_x, workgroup_y, workgroup_z); + IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context, + command_buffer->handle); + return iree_ok_status(); } @@ -587,6 +602,15 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch_indirect( iree_hal_vulkan_direct_command_buffer_t* command_buffer = iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer); + iree_hal_vulkan_source_location_t source_location; + iree_hal_vulkan_native_executable_entry_point_source_location( + executable, entry_point, &source_location); + IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( + command_buffer->tracing_context, command_buffer->handle, + source_location.file_name.data, source_location.file_name.size, + source_location.line, source_location.func_name.data, + source_location.func_name.size, NULL, 0); + // Get the compiled and linked pipeline for the specified entry point and // bind it to the command buffer. 
VkPipeline pipeline_handle = VK_NULL_HANDLE; @@ -602,6 +626,9 @@ static iree_status_t iree_hal_vulkan_direct_command_buffer_dispatch_indirect( command_buffer->syms->vkCmdDispatchIndirect( command_buffer->handle, workgroups_device_buffer, workgroups_offset); + IREE_VULKAN_TRACE_ZONE_END(command_buffer->tracing_context, + command_buffer->handle); + return iree_ok_status(); } diff --git a/iree/hal/vulkan/direct_command_buffer.h b/iree/hal/vulkan/direct_command_buffer.h index 858b521e97a4..18ef97939e66 100644 --- a/iree/hal/vulkan/direct_command_buffer.h +++ b/iree/hal/vulkan/direct_command_buffer.h @@ -18,6 +18,7 @@ #include "iree/hal/api.h" #include "iree/hal/vulkan/descriptor_pool_cache.h" #include "iree/hal/vulkan/handle_util.h" +#include "iree/hal/vulkan/tracing.h" #ifdef __cplusplus extern "C" { @@ -30,6 +31,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate( iree_hal_command_buffer_mode_t mode, iree_hal_command_category_t command_categories, iree_hal_queue_affinity_t queue_affinity, + iree_hal_vulkan_tracing_context_t* tracing_context, iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache, iree_hal_command_buffer_t** out_command_buffer); diff --git a/iree/hal/vulkan/direct_command_queue.cc b/iree/hal/vulkan/direct_command_queue.cc index 461ce9aa59ab..4b78f4dbc7da 100644 --- a/iree/hal/vulkan/direct_command_queue.cc +++ b/iree/hal/vulkan/direct_command_queue.cc @@ -26,10 +26,9 @@ namespace hal { namespace vulkan { DirectCommandQueue::DirectCommandQueue( - VkDeviceHandle* logical_device, std::string name, + VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue) - : CommandQueue(logical_device, std::move(name), supported_categories, - queue) {} + : CommandQueue(logical_device, supported_categories, queue) {} DirectCommandQueue::~DirectCommandQueue() = default; @@ -134,6 +133,7 @@ iree_status_t DirectCommandQueue::WaitIdle(iree_time_t deadline_ns) { iree_status_t status = VK_RESULT_TO_STATUS(syms()->vkQueueWaitIdle(queue_), "vkQueueWaitIdle"); iree_slim_mutex_unlock(&queue_mutex_); + iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE); return status; } @@ -191,6 +191,8 @@ iree_status_t DirectCommandQueue::WaitIdle(iree_time_t deadline_ns) { syms()->vkDestroyFence(*logical_device_, fence, logical_device_->allocator()); + iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE); + return status; } diff --git a/iree/hal/vulkan/direct_command_queue.h b/iree/hal/vulkan/direct_command_queue.h index ad36aadcc58b..fc770ed1fe5f 100644 --- a/iree/hal/vulkan/direct_command_queue.h +++ b/iree/hal/vulkan/direct_command_queue.h @@ -25,7 +25,7 @@ namespace vulkan { // Command queue implementation directly maps to VkQueue. 
class DirectCommandQueue final : public CommandQueue { public: - DirectCommandQueue(VkDeviceHandle* logical_device, std::string name, + DirectCommandQueue(VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue); ~DirectCommandQueue() override; diff --git a/iree/hal/vulkan/dynamic_symbol_tables.h b/iree/hal/vulkan/dynamic_symbol_tables.h index 05dcd591a1ea..a6a5282069e3 100644 --- a/iree/hal/vulkan/dynamic_symbol_tables.h +++ b/iree/hal/vulkan/dynamic_symbol_tables.h @@ -113,7 +113,7 @@ namespace vulkan { DEV_PFN(EXCLUDED, vkCmdPushDescriptorSetWithTemplateKHR) \ DEV_PFN(EXCLUDED, vkCmdReserveSpaceForCommandsNVX) \ DEV_PFN(REQUIRED, vkCmdResetEvent) \ - DEV_PFN(EXCLUDED, vkCmdResetQueryPool) \ + DEV_PFN(REQUIRED, vkCmdResetQueryPool) \ DEV_PFN(EXCLUDED, vkCmdResolveImage) \ DEV_PFN(EXCLUDED, vkCmdSetBlendConstants) \ DEV_PFN(EXCLUDED, vkCmdSetCheckpointNV) \ @@ -174,7 +174,7 @@ namespace vulkan { DEV_PFN(EXCLUDED, vkCreateObjectTableNVX) \ DEV_PFN(REQUIRED, vkCreatePipelineCache) \ DEV_PFN(REQUIRED, vkCreatePipelineLayout) \ - DEV_PFN(EXCLUDED, vkCreateQueryPool) \ + DEV_PFN(REQUIRED, vkCreateQueryPool) \ DEV_PFN(EXCLUDED, vkCreateRayTracingPipelinesNV) \ DEV_PFN(EXCLUDED, vkCreateRenderPass) \ DEV_PFN(EXCLUDED, vkCreateRenderPass2KHR) \ @@ -207,7 +207,7 @@ namespace vulkan { DEV_PFN(REQUIRED, vkDestroyPipeline) \ DEV_PFN(REQUIRED, vkDestroyPipelineCache) \ DEV_PFN(REQUIRED, vkDestroyPipelineLayout) \ - DEV_PFN(EXCLUDED, vkDestroyQueryPool) \ + DEV_PFN(REQUIRED, vkDestroyQueryPool) \ DEV_PFN(EXCLUDED, vkDestroyRenderPass) \ DEV_PFN(EXCLUDED, vkDestroySampler) \ DEV_PFN(EXCLUDED, vkDestroySamplerYcbcrConversion) \ @@ -228,7 +228,7 @@ namespace vulkan { DEV_PFN(REQUIRED, vkGetBufferMemoryRequirements) \ DEV_PFN(EXCLUDED, vkGetBufferMemoryRequirements2) \ DEV_PFN(EXCLUDED, vkGetBufferMemoryRequirements2KHR) \ - DEV_PFN(EXCLUDED, vkGetCalibratedTimestampsEXT) \ + DEV_PFN(OPTIONAL, vkGetCalibratedTimestampsEXT) \ DEV_PFN(EXCLUDED, vkGetDescriptorSetLayoutSupport) \ DEV_PFN(EXCLUDED, vkGetDescriptorSetLayoutSupportKHR) \ DEV_PFN(EXCLUDED, vkGetDeviceGroupPeerMemoryFeatures) \ @@ -255,7 +255,7 @@ namespace vulkan { DEV_PFN(EXCLUDED, vkGetMemoryHostPointerPropertiesEXT) \ DEV_PFN(EXCLUDED, vkGetPastPresentationTimingGOOGLE) \ DEV_PFN(REQUIRED, vkGetPipelineCacheData) \ - DEV_PFN(EXCLUDED, vkGetQueryPoolResults) \ + DEV_PFN(REQUIRED, vkGetQueryPoolResults) \ DEV_PFN(EXCLUDED, vkGetRayTracingShaderGroupHandlesNV) \ DEV_PFN(EXCLUDED, vkGetRefreshCycleDurationGOOGLE) \ DEV_PFN(EXCLUDED, vkGetRenderAreaGranularity) \ @@ -278,7 +278,8 @@ namespace vulkan { DEV_PFN(REQUIRED, vkResetDescriptorPool) \ DEV_PFN(REQUIRED, vkResetEvent) \ DEV_PFN(REQUIRED, vkResetFences) \ - DEV_PFN(EXCLUDED, vkResetQueryPoolEXT) \ + DEV_PFN(OPTIONAL, vkResetQueryPool) \ + DEV_PFN(OPTIONAL, vkResetQueryPoolEXT) \ DEV_PFN(OPTIONAL, vkSetDebugUtilsObjectNameEXT) \ DEV_PFN(OPTIONAL, vkSetDebugUtilsObjectTagEXT) \ DEV_PFN(REQUIRED, vkSetEvent) \ @@ -322,7 +323,7 @@ namespace vulkan { INS_PFN(EXCLUDED, vkGetDisplayPlaneCapabilities2KHR) \ INS_PFN(EXCLUDED, vkGetDisplayPlaneCapabilitiesKHR) \ INS_PFN(EXCLUDED, vkGetDisplayPlaneSupportedDisplaysKHR) \ - INS_PFN(EXCLUDED, vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) \ + INS_PFN(OPTIONAL, vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) \ INS_PFN(EXCLUDED, vkGetPhysicalDeviceCooperativeMatrixPropertiesNV) \ INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPlaneProperties2KHR) \ INS_PFN(EXCLUDED, vkGetPhysicalDeviceDisplayPlanePropertiesKHR) \ diff 
--git a/iree/hal/vulkan/extensibility_util.cc b/iree/hal/vulkan/extensibility_util.cc index 7320cd34c225..0bd9805976ee 100644 --- a/iree/hal/vulkan/extensibility_util.cc +++ b/iree/hal/vulkan/extensibility_util.cc @@ -206,6 +206,12 @@ iree_hal_vulkan_populate_enabled_device_extensions( } else if (strcmp(extension_name, VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME) == 0) { extensions.timeline_semaphore = true; + } else if (strcmp(extension_name, VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME) == + 0) { + extensions.host_query_reset = true; + } else if (strcmp(extension_name, + VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0) { + extensions.calibrated_timestamps = true; } } return extensions; @@ -222,5 +228,11 @@ iree_hal_vulkan_infer_enabled_device_extensions( if (device_syms->vkSignalSemaphore || device_syms->vkSignalSemaphoreKHR) { extensions.timeline_semaphore = true; } + if (device_syms->vkResetQueryPoolEXT) { + extensions.host_query_reset = true; + } + if (device_syms->vkGetCalibratedTimestampsEXT) { + extensions.calibrated_timestamps = true; + } return extensions; } diff --git a/iree/hal/vulkan/extensibility_util.h b/iree/hal/vulkan/extensibility_util.h index 3468794debbc..7f398f1263a0 100644 --- a/iree/hal/vulkan/extensibility_util.h +++ b/iree/hal/vulkan/extensibility_util.h @@ -83,6 +83,10 @@ typedef struct { bool push_descriptors : 1; // VK_KHR_timeline_semaphore is enabled. bool timeline_semaphore : 1; + // VK_EXT_host_query_reset is enabled. + bool host_query_reset : 1; + // VK_EXT_calibrated_timestamps is enabled. + bool calibrated_timestamps : 1; } iree_hal_vulkan_device_extensions_t; // Returns a bitfield with all of the provided extension names. diff --git a/iree/hal/vulkan/native_executable.cc b/iree/hal/vulkan/native_executable.cc index 9cd7ae05ce72..0b93ad3792c8 100644 --- a/iree/hal/vulkan/native_executable.cc +++ b/iree/hal/vulkan/native_executable.cc @@ -26,6 +26,11 @@ using namespace iree::hal::vulkan; +typedef struct { + VkPipeline pipeline; + iree_string_view_t name; +} iree_hal_vulkan_entry_point_t; + static iree_status_t iree_hal_vulkan_create_shader_module( VkDeviceHandle* logical_device, iree_const_byte_span_t code, VkShaderModule* out_shader_module) { @@ -55,7 +60,8 @@ static iree_status_t iree_hal_vulkan_create_pipelines( iree_SpirVExecutableDef_table_t executable_def, VkShaderModule shader_module, iree_host_size_t executable_layout_count, iree_hal_executable_layout_t* const* executable_layouts, - iree_host_size_t pipeline_count, VkPipeline* out_pipelines) { + iree_host_size_t pipeline_count, + iree_hal_vulkan_entry_point_t* out_entry_points) { VkComputePipelineCreateInfo* create_infos = NULL; IREE_RETURN_IF_ERROR(iree_allocator_malloc( logical_device->host_allocator(), @@ -96,11 +102,18 @@ static iree_status_t iree_hal_vulkan_create_pipelines( stage_create_info->pSpecializationInfo = NULL; } + VkPipeline* pipelines = + (VkPipeline*)iree_alloca(pipeline_count * sizeof(VkPipeline)); iree_status_t status = VK_RESULT_TO_STATUS( logical_device->syms()->vkCreateComputePipelines( *logical_device, pipeline_cache, (uint32_t)pipeline_count, - create_infos, logical_device->allocator(), out_pipelines), + create_infos, logical_device->allocator(), pipelines), "vkCreateComputePipelines"); + if (iree_status_is_ok(status)) { + for (iree_host_size_t i = 0; i < pipeline_count; ++i) { + out_entry_points[i].pipeline = pipelines[i]; + } + } iree_allocator_free(logical_device->host_allocator(), create_infos); return status; @@ -179,8 +192,8 @@ static iree_status_t 
iree_hal_spirv_executable_flatbuffer_verify( typedef struct { iree_hal_resource_t resource; VkDeviceHandle* logical_device; - iree_host_size_t pipeline_count; - VkPipeline pipelines[]; + iree_host_size_t entry_point_count; + iree_hal_vulkan_entry_point_t entry_points[]; } iree_hal_vulkan_native_executable_t; extern const iree_hal_executable_vtable_t @@ -226,31 +239,43 @@ iree_status_t iree_hal_vulkan_native_executable_create( // Create pipelines for each entry point. flatbuffers_string_vec_t entry_points_vec = iree_SpirVExecutableDef_entry_points_get(executable_def); - iree_host_size_t pipeline_count = + iree_host_size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); iree_hal_vulkan_native_executable_t* executable = NULL; iree_host_size_t total_size = - sizeof(*executable) + pipeline_count * sizeof(*executable->pipelines); + sizeof(*executable) + + entry_point_count * sizeof(*executable->entry_points); iree_status_t status = iree_allocator_malloc(logical_device->host_allocator(), total_size, (void**)&executable); if (iree_status_is_ok(status)) { iree_hal_resource_initialize(&iree_hal_vulkan_native_executable_vtable, &executable->resource); executable->logical_device = logical_device; - executable->pipeline_count = pipeline_count; - memset(executable->pipelines, 0, - pipeline_count * sizeof(*executable->pipelines)); + executable->entry_point_count = entry_point_count; + memset(executable->entry_points, 0, + entry_point_count * sizeof(*executable->entry_points)); } if (iree_status_is_ok(status)) { status = iree_hal_vulkan_create_pipelines( logical_device, pipeline_cache, executable_spec->caching_mode, executable_def, shader_module, executable_spec->executable_layout_count, - executable_spec->executable_layouts, executable->pipeline_count, - executable->pipelines); + executable_spec->executable_layouts, executable->entry_point_count, + executable->entry_points); } iree_hal_vulkan_destroy_shader_module(logical_device, shader_module); + if (iree_status_is_ok(status)) { + flatbuffers_string_vec_t entry_points_vec = + iree_SpirVExecutableDef_entry_points_get(executable_def); + for (iree_host_size_t i = 0; i < entry_point_count; ++i) { + flatbuffers_string_t name = + flatbuffers_string_vec_at(entry_points_vec, i); + executable->entry_points[i].name = + iree_make_string_view(name, flatbuffers_string_len(name)); + } + } + if (iree_status_is_ok(status)) { *out_executable = (iree_hal_executable_t*)executable; } else { @@ -269,25 +294,41 @@ static void iree_hal_vulkan_native_executable_destroy( executable->logical_device->host_allocator(); IREE_TRACE_ZONE_BEGIN(z0); - for (iree_host_size_t i = 0; i < executable->pipeline_count; ++i) { + for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) { iree_hal_vulkan_destroy_pipeline(executable->logical_device, - executable->pipelines[i]); + executable->entry_points[i].pipeline); } iree_allocator_free(host_allocator, executable); IREE_TRACE_ZONE_END(z0); } +void iree_hal_vulkan_native_executable_entry_point_source_location( + iree_hal_executable_t* base_executable, iree_host_size_t entry_ordinal, + iree_hal_vulkan_source_location_t* out_source_location) { + iree_hal_vulkan_native_executable_t* executable = + iree_hal_vulkan_native_executable_cast(base_executable); + memset(out_source_location, 0, sizeof(*out_source_location)); + if (entry_ordinal >= executable->entry_point_count) { + return; + } + out_source_location->func_name = executable->entry_points[entry_ordinal].name; + + // TODO(benvanik): plumb through file name/line 
for the MLIR function. + out_source_location->file_name = out_source_location->func_name; + out_source_location->line = 0; +} + iree_status_t iree_hal_vulkan_native_executable_pipeline_for_entry_point( iree_hal_executable_t* base_executable, iree_host_size_t entry_ordinal, VkPipeline* out_pipeline_handle) { iree_hal_vulkan_native_executable_t* executable = iree_hal_vulkan_native_executable_cast(base_executable); - if (entry_ordinal >= executable->pipeline_count) { + if (entry_ordinal >= executable->entry_point_count) { return iree_make_status(IREE_STATUS_OUT_OF_RANGE, "invalid entry point ordinal %zu", entry_ordinal); } - *out_pipeline_handle = executable->pipelines[entry_ordinal]; + *out_pipeline_handle = executable->entry_points[entry_ordinal].pipeline; return iree_ok_status(); } diff --git a/iree/hal/vulkan/native_executable.h b/iree/hal/vulkan/native_executable.h index 0f7eed7df0c8..ab1222315c67 100644 --- a/iree/hal/vulkan/native_executable.h +++ b/iree/hal/vulkan/native_executable.h @@ -26,6 +26,12 @@ extern "C" { #endif // __cplusplus +typedef struct { + iree_string_view_t file_name; + int line; + iree_string_view_t func_name; +} iree_hal_vulkan_source_location_t; + // Creates a wrapper for one or more VkPipelines that are sourced from the same // IREE executable. Each of the pipelines will share the same shader module // and just differs by the entry point into the shader module they reference. @@ -35,6 +41,12 @@ iree_status_t iree_hal_vulkan_native_executable_create( const iree_hal_executable_spec_t* executable_spec, iree_hal_executable_t** out_executable); +// Returns the source location for the given entry point. May be empty if not +// available. +void iree_hal_vulkan_native_executable_entry_point_source_location( + iree_hal_executable_t* executable, iree_host_size_t entry_ordinal, + iree_hal_vulkan_source_location_t* out_source_location); + // Returns the cached VkPipeline for the given executable |entry_ordinal|. 
iree_status_t iree_hal_vulkan_native_executable_pipeline_for_entry_point( iree_hal_executable_t* executable, iree_host_size_t entry_ordinal, diff --git a/iree/hal/vulkan/registration/driver_module.cc b/iree/hal/vulkan/registration/driver_module.cc index 76c228fa2361..761943b3fecb 100644 --- a/iree/hal/vulkan/registration/driver_module.cc +++ b/iree/hal/vulkan/registration/driver_module.cc @@ -35,6 +35,9 @@ ABSL_FLAG(int, vulkan_default_index, 0, "Index of the default Vulkan device."); ABSL_FLAG(bool, vulkan_force_timeline_semaphore_emulation, false, "Uses timeline semaphore emulation even if native support exists."); +ABSL_FLAG(bool, vulkan_tracing, true, + "Enables Vulkan tracing (if IREE tracing is enabled)."); + static iree_status_t iree_hal_vulkan_create_driver_with_flags( iree_string_view_t identifier, iree_allocator_t allocator, iree_hal_driver_t** out_driver) { @@ -63,6 +66,9 @@ static iree_status_t iree_hal_vulkan_create_driver_with_flags( driver_options.requested_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_DEBUG_UTILS; } + if (absl::GetFlag(FLAGS_vulkan_tracing)) { + driver_options.requested_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING; + } driver_options.default_device_index = absl::GetFlag(FLAGS_vulkan_default_index); diff --git a/iree/hal/vulkan/serializing_command_queue.cc b/iree/hal/vulkan/serializing_command_queue.cc index 7b6732c4ce24..f03888529417 100644 --- a/iree/hal/vulkan/serializing_command_queue.cc +++ b/iree/hal/vulkan/serializing_command_queue.cc @@ -161,11 +161,10 @@ void PrepareSubmitInfo(absl::Span wait_semaphore_handles, } // namespace SerializingCommandQueue::SerializingCommandQueue( - VkDeviceHandle* logical_device, std::string name, + VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue, TimePointFencePool* fence_pool) - : CommandQueue(logical_device, std::move(name), supported_categories, - queue), + : CommandQueue(logical_device, supported_categories, queue), fence_pool_(fence_pool) {} SerializingCommandQueue::~SerializingCommandQueue() = default; @@ -314,6 +313,8 @@ iree_status_t SerializingCommandQueue::WaitIdle(iree_time_t deadline_ns) { } iree_slim_mutex_unlock(&queue_mutex_); + + iree_hal_vulkan_tracing_context_collect(tracing_context(), VK_NULL_HANDLE); return status; } diff --git a/iree/hal/vulkan/serializing_command_queue.h b/iree/hal/vulkan/serializing_command_queue.h index 3137a1d15d47..9de2c0a8d21c 100644 --- a/iree/hal/vulkan/serializing_command_queue.h +++ b/iree/hal/vulkan/serializing_command_queue.h @@ -52,7 +52,7 @@ using SemaphoreValue = std::pair; // the GPU. class SerializingCommandQueue final : public CommandQueue { public: - SerializingCommandQueue(VkDeviceHandle* logical_device, std::string name, + SerializingCommandQueue(VkDeviceHandle* logical_device, iree_hal_command_category_t supported_categories, VkQueue queue, TimePointFencePool* fence_pool); ~SerializingCommandQueue() override; diff --git a/iree/hal/vulkan/tracing.cc b/iree/hal/vulkan/tracing.cc new file mode 100644 index 000000000000..a5de314a5d41 --- /dev/null +++ b/iree/hal/vulkan/tracing.cc @@ -0,0 +1,648 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "iree/hal/vulkan/tracing.h" + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + +#include "iree/base/api.h" +#include "iree/base/internal/debugging.h" +#include "iree/base/target_platform.h" +#include "third_party/tracy/Tracy.hpp" +#include "third_party/tracy/client/TracyProfiler.hpp" + +// Total number of queries the per-queue query pool will contain. This +// translates to the maximum number of outstanding queries before collection is +// required. +#define IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY (64 * 1024) + +// Total number of queries that can be read back from the API in a single +// collection. +#define IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY (8 * 1024) + +// Number of times we will query the max_deviation from calibrated timestamps. +// The more we do the better confidence we have in a lower-bound. +#define IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT 32 + +typedef struct { + uint64_t timestamp; + uint64_t availability; // non-zero if available +} iree_hal_vulkan_timestamp_query_t; + +struct iree_hal_vulkan_tracing_context_s { + // Device and queue the context represents. + iree::hal::vulkan::VkDeviceHandle* logical_device; + VkQueue queue; + iree_allocator_t host_allocator; + + // Maintenance queue that supports dispatch commands and can be used to reset + // queries. + VkQueue maintenance_dispatch_queue; + // Command pool that serves command buffers compatible with the + // |maintenance_dispatch_queue|. + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool; + + // A unique GPU zone ID allocated from Tracy. + // There is a global limit of 255 GPU zones (ID 255 is special). + uint8_t id; + + // Defines how the timestamps are interpreted (device-specific, posix, QPC). + // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkTimeDomainEXT.html + VkTimeDomainEXT time_domain; + + // Maximum expected deviation between CPU and GPU timestamps based on an + // average computed at startup. Calibration events that exceed this value are + // discarded. + uint64_t max_expected_deviation; + + // Vulkan-reported CPU timestamp of the last calibration. + // Used to detect when drift occurs and we need to notify tracy. + uint64_t previous_cpu_time; + + // Pool of query instances that we treat as a backing store for a ringbuffer. + VkQueryPool query_pool; + + // Indices into |query_pool| defining a ringbuffer. + uint32_t query_head; + uint32_t query_tail; + uint32_t query_capacity; + + // Readback storage; large enough to get a decent chunk of queries back from + // the API in one shot. + // + // Data is stored as [[timestamp, availability], ...]. + // Availability will be non-zero if the timestamp is valid. Since we put all + // timestamps in order once we reach an unavailable timestamp we can bail + // and leave that for future collections. + iree_hal_vulkan_timestamp_query_t + readback_buffer[IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY]; +}; + +// Allocates and begins a command buffer and returns its handle. +// Returns VK_NULL_HANDLE if allocation fails. 
+static VkCommandBuffer iree_hal_vulkan_tracing_begin_command_buffer( + iree_hal_vulkan_tracing_context_t* context) { + const auto& syms = context->logical_device->syms(); + + VkCommandBufferAllocateInfo command_buffer_info; + memset(&command_buffer_info, 0, sizeof(command_buffer_info)); + command_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + command_buffer_info.commandPool = *context->maintenance_command_pool; + command_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + command_buffer_info.commandBufferCount = 1; + VkCommandBuffer command_buffer = VK_NULL_HANDLE; + IREE_IGNORE_ERROR(context->maintenance_command_pool->Allocate( + &command_buffer_info, &command_buffer)); + if (!command_buffer) return VK_NULL_HANDLE; + + VkCommandBufferBeginInfo begin_info; + memset(&begin_info, 0, sizeof(begin_info)); + begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + syms->vkBeginCommandBuffer(command_buffer, &begin_info); + + return command_buffer; +} + +// Ends and submits |command_buffer| and waits for it to complete. +static void iree_hal_vulkan_tracing_submit_command_buffer( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) { + const auto& syms = context->logical_device->syms(); + + syms->vkEndCommandBuffer(command_buffer); + + VkSubmitInfo submit_info; + memset(&submit_info, 0, sizeof(submit_info)); + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &command_buffer; + syms->vkQueueSubmit(context->maintenance_dispatch_queue, 1, &submit_info, + VK_NULL_HANDLE); + syms->vkQueueWaitIdle(context->maintenance_dispatch_queue); + + context->maintenance_command_pool->Free(command_buffer); +} + +// Synchronously resets a range of queries in the query pool. +// This may submit commands to the queue. +static void iree_hal_vulkan_tracing_reset_query_pool( + iree_hal_vulkan_tracing_context_t* context, uint32_t query_index, + uint32_t query_count) { + const auto& syms = context->logical_device->syms(); + + // Fast-path for when host-side vkResetQueryPool is available. + // This is core in Vulkan 1.2. + if (context->logical_device->enabled_extensions().host_query_reset) { + PFN_vkResetQueryPool vkResetQueryPool_fn = syms->vkResetQueryPool + ? syms->vkResetQueryPool + : syms->vkResetQueryPoolEXT; + if (vkResetQueryPool_fn != NULL) { + vkResetQueryPool_fn(*context->logical_device, context->query_pool, + query_index, query_count); + return; + } + } + + // Slow-path submitting a command buffer to reset the query pool. It's obvious + // why vkResetQueryPool was added :) + VkCommandBuffer command_buffer = + iree_hal_vulkan_tracing_begin_command_buffer(context); + if (command_buffer != VK_NULL_HANDLE) { + syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_index, + query_count); + iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer); + } +} + +// Attempts to get a timestamp from both the CPU and GPU that are correlated +// with each other. Only valid when calibration is supported.
+static void iree_hal_vulkan_tracing_query_calibration_timestamps( + iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time, + uint64_t* out_gpu_time) { + *out_cpu_time = 0; + *out_gpu_time = 0; + + VkCalibratedTimestampInfoEXT timestamp_infos[2]; + timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; + timestamp_infos[0].pNext = NULL; + timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT; + timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; + timestamp_infos[1].pNext = NULL; + timestamp_infos[1].timeDomain = context->time_domain; + uint64_t timestamps[2] = {0, 0}; + uint64_t max_deviation = 0; + do { + context->logical_device->syms()->vkGetCalibratedTimestampsEXT( + *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos, + timestamps, &max_deviation); + } while (max_deviation > context->max_expected_deviation); + + *out_gpu_time = timestamps[0]; + *out_cpu_time = timestamps[1]; + switch (context->time_domain) { +#if defined(IREE_PLATFORM_WINDOWS) + case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT: + *out_cpu_time *= (uint64_t)(1000000000.0 / tracy::GetFrequencyQpc()); + break; +#else + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: + // TODO(benvanik): posix calibrated timestamps - ignored for now. + break; +#endif // IREE_PLATFORM_WINDOWS + } +} + +// Populates |out_cpu_time| and |out_gpu_time| with calibrated timestamps. +// Depending on whether VK_EXT_calibrated_timestamps is available this may be +// a guess done by ourselves (with lots of slop) or done by the driver (with +// less slop). +static void iree_hal_vulkan_tracing_perform_initial_calibration( + iree_hal_vulkan_tracing_context_t* context, uint64_t* out_cpu_time, + uint64_t* out_gpu_time) { + const auto& syms = context->logical_device->syms(); + *out_cpu_time = 0; + *out_gpu_time = 0; + + // Attempt to get a timestamp from both the device and the host at roughly the + // same time. There's a gap between when we get control returned to us after + // submitting and waiting for idle and that will be the slop we have in the + // timings in the tracy UI. + if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) { + // Submit a device timestamp. + VkCommandBuffer command_buffer = + iree_hal_vulkan_tracing_begin_command_buffer(context); + if (command_buffer != VK_NULL_HANDLE) { + syms->vkCmdWriteTimestamp(command_buffer, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + context->query_pool, 0); + iree_hal_vulkan_tracing_submit_command_buffer(context, command_buffer); + } + + // Query the timestamp from the host and the device. + *out_cpu_time = tracy::Profiler::GetTime(); + syms->vkGetQueryPoolResults( + *context->logical_device, context->query_pool, 0, 1, + sizeof(*out_gpu_time), out_gpu_time, sizeof(*out_gpu_time), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + + // Reset the query used. + iree_hal_vulkan_tracing_reset_query_pool(context, 0, 1); + return; + } + + // From the spec: + // The maximum deviation may vary between calls to + // vkGetCalibratedTimestampsEXT even for the same set of time domains due to + // implementation and platform specific reasons. It is the application’s + // responsibility to assess whether the returned maximum deviation makes the + // timestamp values suitable for any particular purpose and can choose to + // re-issue the timestamp calibration call pursuing a lower deviation value.
+ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkGetCalibratedTimestampsEXT.html + // + // We perform a small number of queries here and find the minimum deviation + // across all of them to get an average lower bound on the maximum deviation + // from any particular query. We then use that as our baseline (plus some + // slop) to see if calibration events in the future are reasonable. + VkCalibratedTimestampInfoEXT timestamp_infos[2]; + timestamp_infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; + timestamp_infos[0].pNext = NULL; + timestamp_infos[0].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT; + timestamp_infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; + timestamp_infos[1].pNext = NULL; + timestamp_infos[1].timeDomain = context->time_domain; + uint64_t max_deviations[IREE_HAL_VULKAN_TRACING_MAX_DEVIATION_PROBE_COUNT]; + for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(max_deviations); ++i) { + uint64_t timestamps[2] = {0, 0}; + syms->vkGetCalibratedTimestampsEXT( + *context->logical_device, IREE_ARRAYSIZE(timestamps), timestamp_infos, + timestamps, &max_deviations[i]); + } + uint64_t min_deviation = max_deviations[0]; + for (iree_host_size_t i = 1; i < IREE_ARRAYSIZE(max_deviations); ++i) { + min_deviation = iree_min(min_deviation, max_deviations[i]); + } + context->max_expected_deviation = min_deviation * 3 / 2; + + iree_hal_vulkan_tracing_query_calibration_timestamps( + context, &context->previous_cpu_time, out_gpu_time); + *out_cpu_time = tracy::Profiler::GetTime(); +} + +// Performs a periodic calibration (if supported) and sends the data to tracy. +// Over time the host and device clocks may drift (especially with power events) +// and by frequently performing this we ensure that the samples we are sending +// to tracy are able to be correlated. +void iree_hal_vulkan_tracing_perform_calibration( + iree_hal_vulkan_tracing_context_t* context) { + if (context->time_domain == VK_TIME_DOMAIN_DEVICE_EXT) return; + + uint64_t cpu_time = 0; + uint64_t gpu_time = 0; + iree_hal_vulkan_tracing_query_calibration_timestamps(context, &cpu_time, + &gpu_time); + + uint64_t tracy_time = tracy::Profiler::GetTime(); + if (cpu_time > context->previous_cpu_time) { + uint64_t cpu_delta = cpu_time - context->previous_cpu_time; + context->previous_cpu_time = cpu_time; + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuCalibration); + tracy::MemWrite(&item->gpuCalibration.gpuTime, gpu_time); + tracy::MemWrite(&item->gpuCalibration.cpuTime, tracy_time); + tracy::MemWrite(&item->gpuCalibration.cpuDelta, cpu_delta); + tracy::MemWrite(&item->gpuCalibration.context, context->id); + tracy::Profiler::QueueSerialFinish(); + } +} + +// Prepares the VkQueryPool backing storage for our query ringbuffer. +static void iree_hal_vulkan_tracing_prepare_query_pool( + iree_hal_vulkan_tracing_context_t* context) { + // Create a query pool with the largest query capacity it can provide. 
+ VkQueryPoolCreateInfo pool_info; + memset(&pool_info, 0, sizeof(pool_info)); + pool_info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + pool_info.queryCount = IREE_HAL_VULKAN_TRACING_DEFAULT_QUERY_CAPACITY; + pool_info.queryType = VK_QUERY_TYPE_TIMESTAMP; + while (context->logical_device->syms()->vkCreateQueryPool( + *context->logical_device, &pool_info, + context->logical_device->allocator(), + &context->query_pool) != VK_SUCCESS) { + pool_info.queryCount /= 2; + } + context->query_capacity = pool_info.queryCount; + + // Perform initial reset of the query pool. All queries must be reset upon + // creation before first use. + iree_hal_vulkan_tracing_reset_query_pool(context, 0, context->query_capacity); +} + +// Prepares the Tracy-related GPU context that events are fed into. Each context +// will appear as a unique plot in the tracy UI with the given |queue_name|. +static void iree_hal_vulkan_tracing_prepare_gpu_context( + iree_hal_vulkan_tracing_context_t* context, + VkPhysicalDevice physical_device, iree_string_view_t queue_name) { + // Allocate the process-unique GPU context ID. There's a max of 255 available; + // if we are recreating devices a lot we may exceed that. Don't do that, or + // wrap around and get weird (but probably still usable) numbers. + context->id = + tracy::GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed); + if (context->id >= 255) { + context->id %= 255; + } + + // The number of nanoseconds required for a timestamp query to be incremented + // by 1. + VkPhysicalDeviceProperties device_properties; + context->logical_device->syms()->vkGetPhysicalDeviceProperties( + physical_device, &device_properties); + float timestamp_period = device_properties.limits.timestampPeriod; + + // Perform initial calibration for tracy to be able to correlate timestamps + // between CPU and GPU. + uint64_t cpu_time = 0; + uint64_t gpu_time = 0; + iree_hal_vulkan_tracing_perform_initial_calibration(context, &cpu_time, + &gpu_time); + + uint8_t context_flags = 0; + if (context->time_domain != VK_TIME_DOMAIN_DEVICE_EXT) { + // Tell tracy we'll be passing calibrated timestamps and not to mess with + // the times. We'll periodically send GpuCalibration events in case the + // times drift. + context_flags |= tracy::GpuContextCalibration; + } + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuNewContext); + tracy::MemWrite(&item->gpuNewContext.cpuTime, cpu_time); + tracy::MemWrite(&item->gpuNewContext.gpuTime, gpu_time); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + tracy::MemWrite(&item->gpuNewContext.period, timestamp_period); + tracy::MemWrite(&item->gpuNewContext.context, context->id); + tracy::MemWrite(&item->gpuNewContext.flags, context_flags); + tracy::MemWrite(&item->gpuNewContext.type, tracy::GpuContextType::Vulkan); + tracy::Profiler::QueueSerialFinish(); + } + + // Send the name of the context along. + // NOTE: we intentionally leak the name here as tracy needs a pointer that + // survives until process exit (in case TRACY_NO_EXIT is set and the app waits + // in exit() for the profiler to attach). 
+ IREE_LEAK_CHECK_DISABLE_PUSH(); + char* cloned_name = (char*)malloc(queue_name.size); + memcpy(cloned_name, queue_name.data, queue_name.size); + IREE_LEAK_CHECK_DISABLE_POP(); + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuContextName); + tracy::MemWrite(&item->gpuContextNameFat.context, context->id); + tracy::MemWrite(&item->gpuContextNameFat.ptr, (uint64_t)cloned_name); + tracy::MemWrite(&item->gpuContextNameFat.size, queue_name.size); + tracy::Profiler::QueueSerialFinish(); + } +} + +// Returns the best possible platform-supported time domain, falling back to +// VK_TIME_DOMAIN_DEVICE_EXT. By default it is one that is only usable for +// device-relative calculations and that we need to perform our own hacky +// calibration on. +static VkTimeDomainEXT iree_hal_vulkan_tracing_query_time_domain( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device) { + if (!logical_device->enabled_extensions().calibrated_timestamps) { + // Calibrated timestamps extension is not available; we'll only have the + // device domain. + return VK_TIME_DOMAIN_DEVICE_EXT; + } + + uint32_t time_domain_count = 0; + if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( + physical_device, &time_domain_count, NULL) != VK_SUCCESS) { + return VK_TIME_DOMAIN_DEVICE_EXT; + } + VkTimeDomainEXT* time_domains = (VkTimeDomainEXT*)iree_alloca( + time_domain_count * sizeof(VkTimeDomainEXT)); + if (logical_device->syms()->vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( + physical_device, &time_domain_count, time_domains) != VK_SUCCESS) { + return VK_TIME_DOMAIN_DEVICE_EXT; + } + + for (uint32_t i = 0; i < time_domain_count; i++) { + switch (time_domains[i]) { +#if defined(IREE_PLATFORM_WINDOWS) + case VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT: + return time_domains[i]; +#else + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: + // TODO(benvanik): support posix clock domains with some kind of math. + // return time_domains[i]; -- ignored +#endif // IREE_PLATFORM_WINDOWS + default: + continue; + } + } + return VK_TIME_DOMAIN_DEVICE_EXT; +} + +iree_status_t iree_hal_vulkan_tracing_context_allocate( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, + iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue, + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool, + iree_allocator_t host_allocator, + iree_hal_vulkan_tracing_context_t** out_context) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_ASSERT_ARGUMENT(logical_device); + IREE_ASSERT_ARGUMENT(out_context); + *out_context = NULL; + + iree_hal_vulkan_tracing_context_t* context = NULL; + iree_status_t status = + iree_allocator_malloc(host_allocator, sizeof(*context), (void**)&context); + if (iree_status_is_ok(status)) { + context->logical_device = logical_device; + context->queue = queue; + context->host_allocator = host_allocator; + context->time_domain = iree_hal_vulkan_tracing_query_time_domain( + physical_device, logical_device); + context->maintenance_dispatch_queue = maintenance_dispatch_queue; + context->maintenance_command_pool = maintenance_command_pool; + + // Prepare the query pool and perform the initial calibration. + iree_hal_vulkan_tracing_prepare_query_pool(context); + + // Prepare the Tracy GPU context. 
+ iree_hal_vulkan_tracing_prepare_gpu_context(context, physical_device, + queue_name); + } + + if (iree_status_is_ok(status)) { + *out_context = context; + } else { + iree_hal_vulkan_tracing_context_free(context); + } + IREE_TRACE_ZONE_END(z0); + return status; +} + +void iree_hal_vulkan_tracing_context_free( + iree_hal_vulkan_tracing_context_t* context) { + if (!context) return; + IREE_TRACE_ZONE_BEGIN(z0); + + if (context->query_pool != VK_NULL_HANDLE) { + // Always perform a collection on shutdown. + iree_hal_vulkan_tracing_context_collect(context, VK_NULL_HANDLE); + + auto* logical_device = context->logical_device; + logical_device->syms()->vkDestroyQueryPool( + *logical_device, context->query_pool, logical_device->allocator()); + } + + iree_allocator_t host_allocator = context->host_allocator; + iree_allocator_free(host_allocator, context); + + IREE_TRACE_ZONE_END(z0); +} + +uint32_t iree_hal_vulkan_tracing_context_acquire_query_id( + iree_hal_vulkan_tracing_context_t* context) { + uint32_t id = context->query_head; + context->query_head = (context->query_head + 1) % context->query_capacity; + assert(context->query_head != context->query_tail); + return id; +} + +void iree_hal_vulkan_tracing_context_collect( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) { + if (!context) return; + if (context->query_tail == context->query_head) { + // No outstanding queries. + return; + } + IREE_TRACE_ZONE_BEGIN(z0); + const auto& syms = context->logical_device->syms(); + + while (context->query_tail != context->query_head) { + // Compute the contiguous range of queries ready to be read. + // If the ringbuffer wraps around we'll handle that in the next loop. + uint32_t try_query_count = + context->query_head < context->query_tail + ? context->query_capacity - context->query_tail + : context->query_head - context->query_tail; + try_query_count = iree_min(try_query_count, + IREE_HAL_VULKAN_TRACING_READBACK_QUERY_CAPACITY); + + // Read back all of the queries. Note that we also are reading back the + // availability such that we can handle partial readiness of the outstanding + // range of queries. + uint32_t query_base = context->query_tail; + if (syms->vkGetQueryPoolResults( + *context->logical_device, context->query_pool, query_base, + try_query_count, sizeof(context->readback_buffer), + context->readback_buffer, sizeof(iree_hal_vulkan_timestamp_query_t), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) != + VK_SUCCESS) { + break; + } + + // Scan and feed the times to tracy, stopping when we hit the first + // unavailable query. + uint32_t read_query_count = 0; + for (uint32_t i = 0; i < try_query_count; ++i) { + if (context->readback_buffer[i].availability == 0) break; + read_query_count = i + 1; + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuTime); + tracy::MemWrite(&item->gpuTime.gpuTime, + context->readback_buffer[i].timestamp); + tracy::MemWrite(&item->gpuTime.queryId, (uint16_t)(query_base + i)); + tracy::MemWrite(&item->gpuTime.context, context->id); + tracy::Profiler::QueueSerialFinish(); + } + + // Reset the range of queries read back. 
+ if (command_buffer != VK_NULL_HANDLE) { + syms->vkCmdResetQueryPool(command_buffer, context->query_pool, query_base, + read_query_count); + } else { + iree_hal_vulkan_tracing_reset_query_pool(context, query_base, + read_query_count); + } + + context->query_tail += read_query_count; + if (context->query_tail >= context->query_capacity) { + context->query_tail = 0; + } + } + + // Run calibration - we could do this less frequently in cases where collect + // is called every submission, however it's relatively cheap compared to all + // this other tracing overhead. + iree_hal_vulkan_tracing_perform_calibration(context); + + IREE_TRACE_ZONE_END(z0); +} + +void iree_hal_vulkan_tracing_zone_begin_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, + const iree_tracing_location_t* src_loc) { + if (!context) return; + + uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); + context->logical_device->syms()->vkCmdWriteTimestamp( + command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, + query_id); + + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneBeginSerial); + tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime()); + tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc); + tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle()); + tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id); + tracy::MemWrite(&item->gpuZoneBegin.context, context->id); + tracy::Profiler::QueueSerialFinish(); +} + +void iree_hal_vulkan_tracing_zone_begin_external_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, + const char* file_name, size_t file_name_length, uint32_t line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length) { + if (!context) return; + + uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); + context->logical_device->syms()->vkCmdWriteTimestamp( + command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, + query_id); + + const auto src_loc = tracy::Profiler::AllocSourceLocation( + line, file_name, file_name_length, function_name, function_name_length, + name, name_length); + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, + tracy::QueueType::GpuZoneBeginAllocSrcLocSerial); + tracy::MemWrite(&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime()); + tracy::MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)src_loc); + tracy::MemWrite(&item->gpuZoneBegin.thread, tracy::GetThreadHandle()); + tracy::MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)query_id); + tracy::MemWrite(&item->gpuZoneBegin.context, context->id); + tracy::Profiler::QueueSerialFinish(); +} + +void iree_hal_vulkan_tracing_zone_end_impl( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) { + if (!context) return; + + uint32_t query_id = iree_hal_vulkan_tracing_context_acquire_query_id(context); + context->logical_device->syms()->vkCmdWriteTimestamp( + command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, context->query_pool, + query_id); + + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite(&item->hdr.type, tracy::QueueType::GpuZoneEndSerial); + tracy::MemWrite(&item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime()); + tracy::MemWrite(&item->gpuZoneEnd.thread, tracy::GetThreadHandle()); + tracy::MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)query_id); + 
tracy::MemWrite(&item->gpuZoneEnd.context, context->id); + tracy::Profiler::QueueSerialFinish(); +} + +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION diff --git a/iree/hal/vulkan/tracing.h b/iree/hal/vulkan/tracing.h new file mode 100644 index 000000000000..f637b702b1a3 --- /dev/null +++ b/iree/hal/vulkan/tracing.h @@ -0,0 +1,179 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef IREE_HAL_VULKAN_TRACING_H_ +#define IREE_HAL_VULKAN_TRACING_H_ + +// clang-format off: Must be included before all other headers: +#include "iree/hal/vulkan/vulkan_headers.h" +// clang-format on + +#include "iree/base/tracing.h" +#include "iree/hal/vulkan/handle_util.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Per-queue Vulkan tracing context. +// No-op if IREE tracing is not enabled. +// +// Use the IREE_VULKAN_TRACE_* macros to trace a contiguous set of command +// buffer operations. Unlike the normal tracy macros there are no zone IDs and +// instead each queue gets an ID allocated once and passed to all tracing +// macros. +// +// Usage: +// IREE_VULKAN_TRACE_ZONE_BEGIN(queue->tracing_context, command_buffer); +// vkCmdDispatch(command_buffer, ...); +// IREE_VULKAN_TRACE_ZONE_END(queue->tracing_context, command_buffer); +// ... +// iree_hal_vulkan_tracing_context_collect(queue->tracing_context, +// command_buffer); +// vkQueueSubmit(...command_buffer...); +// +// NOTE: timestamps have non-trivial side-effecting behavior on the device: +// inserting a timestamp is in the worst (and average) case just as bad as +// inserting a full global execution barrier. If two command buffer operations +// that could overlap (no barrier between them) have tracing zones placed around +// them, they will execute sequentially. +// +// TODO(benvanik): +// Each queue needs a context and maintains its own query pool. In the future +// this should be changed to have a single query pool per device to reduce +// bookkeeping overhead. +// +// TODO(benvanik): +// Both a zone begin and zone end always insert timestamps leading to N*2 +// total queries, however within command buffers the end of one zone and the +// begin of another share the same point in time. By inserting the timestamps +// at barriers in the command buffer the query count can be reduced to N+1. +// +// TODO(benvanik): +// vkCmdCopyQueryPoolResults is really what we should be using to do this - +// that inserts a device-side transfer to a buffer (conceptually) that is +// in-stream with all submissions to a queue. This changes things to a push +// model vs. the pull one in _collect and allows us to pipeline the readbacks. +// Instead of being limited to the query pool slots we'd only be limited by +// the size of the buffer the copy targets allowing us to perform collection +// much more infrequently. +// +// Thread-compatible: external synchronization is required if using from +// multiple threads (same as with VkQueue itself).
+typedef struct iree_hal_vulkan_tracing_context_s + iree_hal_vulkan_tracing_context_t; + +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + +// Allocates a tracing context for the given Vulkan queue. +// Each context must only be used with the queue it was created with. +// +// |maintenance_dispatch_queue| may be used to perform query pool maintenance +// tasks and must support graphics or compute commands. +iree_status_t iree_hal_vulkan_tracing_context_allocate( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, + iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue, + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool, + iree_allocator_t host_allocator, + iree_hal_vulkan_tracing_context_t** out_context); + +// Frees a tracing context and all associated Vulkan resources. +// All submissions using the resources must be completed prior to calling. +void iree_hal_vulkan_tracing_context_free( + iree_hal_vulkan_tracing_context_t* context); + +// Collects in-flight timestamp queries from the queue and feeds them to tracy. +// Must be called frequently (every submission, etc.) to drain the backlog; +// tracing may start failing if the internal ringbuffer is exceeded. +// +// The provided |command_buffer| may receive additional bookkeeping commands +// that should have no impact on correctness or behavior. If VK_NULL_HANDLE is +// provided then collection will occur synchronously. +void iree_hal_vulkan_tracing_context_collect( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer); + +// Begins a normal zone derived from the calling |src_loc|. +// Must be perfectly nested and paired with a corresponding zone end. +void iree_hal_vulkan_tracing_zone_begin_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, + const iree_tracing_location_t* src_loc); + +// Begins an external zone using the given source information. +// The provided strings will be copied into the tracy buffer. +void iree_hal_vulkan_tracing_zone_begin_external_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer, + const char* file_name, size_t file_name_length, uint32_t line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length); + +void iree_hal_vulkan_tracing_zone_end_impl( + iree_hal_vulkan_tracing_context_t* context, VkCommandBuffer command_buffer); + +// Begins a new zone with the parent function name. +#define IREE_VULKAN_TRACE_ZONE_BEGIN(context, command_buffer) \ + static const iree_tracing_location_t TracyConcat( \ + __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, \ + __FILE__, (uint32_t)__LINE__, 0}; \ + iree_hal_vulkan_tracing_zone_begin_impl( \ + context, command_buffer, \ + &TracyConcat(__tracy_source_location, __LINE__));
 + +// Begins an externally defined zone with a dynamic source location. +// The |file_name|, |function_name|, and optional |name| strings will be copied +// into the trace buffer and do not need to persist. +#define IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, command_buffer, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) \ + iree_hal_vulkan_tracing_zone_begin_external_impl( \ + context, command_buffer, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) + +// Ends the current zone. Must be paired with a preceding _BEGIN on the same command buffer.
+#define IREE_VULKAN_TRACE_ZONE_END(context, command_buffer) \ + iree_hal_vulkan_tracing_zone_end_impl(context, command_buffer) + +#else + +inline iree_status_t iree_hal_vulkan_tracing_context_allocate( + VkPhysicalDevice physical_device, + iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, + iree_string_view_t queue_name, VkQueue maintenance_dispatch_queue, + iree::hal::vulkan::VkCommandPoolHandle* maintenance_command_pool, + iree_allocator_t host_allocator, + iree_hal_vulkan_tracing_context_t** out_context) { + *out_context = NULL; + return iree_ok_status(); +} + +inline void iree_hal_vulkan_tracing_context_free( + iree_hal_vulkan_tracing_context_t* context) {} + +inline void iree_hal_vulkan_tracing_context_collect( + iree_hal_vulkan_tracing_context_t* context, + VkCommandBuffer command_buffer) {} + +#define IREE_VULKAN_TRACE_ZONE_BEGIN(context, command_buffer) +#define IREE_VULKAN_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, command_buffer, file_name, file_name_length, line, function_name, \ + function_name_length, name, name_length) +#define IREE_VULKAN_TRACE_ZONE_END(context, command_buffer) + +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_VULKAN_TRACING_H_ diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc index 32f14b404321..9cf8b3ac9b6f 100644 --- a/iree/hal/vulkan/vulkan_device.cc +++ b/iree/hal/vulkan/vulkan_device.cc @@ -39,6 +39,7 @@ #include "iree/hal/vulkan/nop_executable_cache.h" #include "iree/hal/vulkan/serializing_command_queue.h" #include "iree/hal/vulkan/status_util.h" +#include "iree/hal/vulkan/tracing.h" #include "iree/hal/vulkan/vma_allocator.h" using namespace iree::hal::vulkan; @@ -135,6 +136,26 @@ iree_hal_vulkan_query_extensibility_set( VK_EXT_DEBUG_UTILS_EXTENSION_NAME); } +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + if (iree_all_bits_set(requested_features, + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) { + // VK_EXT_host_query_reset: + // optionally allows for vkResetQueryPool to be used to reset query pools + // from the host without needing to do an expensive vkCmdResetQueryPool + // submission. + ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL, + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME); + + // VK_EXT_calibrated_timestamps: + // optionally provides more accurate timestamps that correspond to the + // system time. If this is not present then tracy will attempt calibration + // itself and have some per-run variance in the skew (up to many + // milliseconds). + ADD_EXT(IREE_HAL_VULKAN_EXTENSIBILITY_DEVICE_EXTENSIONS_OPTIONAL, + VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME); + } +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + *out_string_count = string_count; return status; } @@ -338,6 +359,9 @@ typedef struct { iree_host_size_t transfer_queue_count; CommandQueue** transfer_queues; + // |queue_count| tracing contexts, if tracing is enabled. 
+ iree_hal_vulkan_tracing_context_t** queue_tracing_contexts; + DescriptorPoolCache* descriptor_pool_cache; VkCommandPoolHandle* dispatch_command_pool; @@ -395,65 +419,110 @@ static CommandQueue* iree_hal_vulkan_device_create_queue( VkQueue queue = VK_NULL_HANDLE; logical_device->syms()->vkGetDeviceQueue(*logical_device, queue_family_index, queue_index, &queue); - std::string queue_name; - if (!iree_all_bits_set(command_category, - IREE_HAL_COMMAND_CATEGORY_DISPATCH)) { - queue_name = "q(t):"; - } else { - queue_name = "q(d):"; - } - queue_name += std::to_string(queue_index); // When emulating timeline semaphores we use a special queue that allows us to // sequence the semaphores correctly. if (fence_pool != NULL) { - return new SerializingCommandQueue(logical_device, std::move(queue_name), - command_category, queue, fence_pool); + return new SerializingCommandQueue(logical_device, command_category, queue, + fence_pool); } - return new DirectCommandQueue(logical_device, std::move(queue_name), - command_category, queue); + return new DirectCommandQueue(logical_device, command_category, queue); } // Creates command queues for the given sets of queues and populates the // device queue lists. -static void iree_hal_vulkan_device_initialize_command_queues( - iree_hal_vulkan_device_t* device, iree_string_view_t queue_prefix, +static iree_status_t iree_hal_vulkan_device_initialize_command_queues( + iree_hal_vulkan_device_t* device, + iree_hal_vulkan_features_t enabled_features, + iree_string_view_t queue_prefix, const iree_hal_vulkan_queue_set_t* compute_queue_set, const iree_hal_vulkan_queue_set_t* transfer_queue_set) { device->queue_count = 0; device->dispatch_queue_count = 0; device->transfer_queue_count = 0; + // The first available queue supporting dispatch commands that will be used by + // the tracing subsystem for query and cleanup tasks. + VkQueue maintenance_dispatch_queue = VK_NULL_HANDLE; + uint64_t compute_queue_count = iree_math_count_ones_u64(compute_queue_set->queue_indices); uint64_t transfer_queue_count = iree_math_count_ones_u64(transfer_queue_set->queue_indices); for (iree_host_size_t i = 0; i < compute_queue_count; ++i) { if (!(compute_queue_set->queue_indices & (1ull << i))) continue; + + char queue_name_buffer[32]; + int queue_name_length = + snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer), + "Vulkan[%c:%d]", 'D', (int)device->dispatch_queue_count); + iree_string_view_t queue_name = + iree_make_string_view(queue_name_buffer, queue_name_length); + CommandQueue* queue = iree_hal_vulkan_device_create_queue( device->logical_device, IREE_HAL_COMMAND_CATEGORY_ANY, compute_queue_set->queue_family_index, i, device->fence_pool); - device->queues[device->queue_count++] = queue; + + iree_host_size_t queue_index = device->queue_count++; + device->queues[queue_index] = queue; device->dispatch_queues[device->dispatch_queue_count++] = queue; + if (!transfer_queue_count) { // If we don't have any dedicated transfer queues then use all dispatch // queues as transfer queues. 
device->transfer_queues[device->transfer_queue_count++] = queue; } + + if (maintenance_dispatch_queue == VK_NULL_HANDLE) { + maintenance_dispatch_queue = queue->handle(); + } + + if (iree_all_bits_set(enabled_features, + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) { + IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate( + device->physical_device, device->logical_device, queue->handle(), + queue_name, maintenance_dispatch_queue, device->dispatch_command_pool, + device->host_allocator, + &device->queue_tracing_contexts[queue_index])); + queue->set_tracing_context(device->queue_tracing_contexts[queue_index]); + } } for (iree_host_size_t i = 0; i < transfer_queue_count; ++i) { if (!(transfer_queue_set->queue_indices & (1ull << i))) continue; + + char queue_name_buffer[32]; + int queue_name_length = + snprintf(queue_name_buffer, IREE_ARRAYSIZE(queue_name_buffer), + "Vulkan[%c:%d]", 'T', (int)device->transfer_queue_count); + iree_string_view_t queue_name = + iree_make_string_view(queue_name_buffer, queue_name_length); + CommandQueue* queue = iree_hal_vulkan_device_create_queue( device->logical_device, IREE_HAL_COMMAND_CATEGORY_TRANSFER, transfer_queue_set->queue_family_index, i, device->fence_pool); - device->queues[device->queue_count++] = queue; + + iree_host_size_t queue_index = device->queue_count++; + device->queues[queue_index] = queue; device->transfer_queues[device->transfer_queue_count++] = queue; + + if (iree_all_bits_set(enabled_features, + IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING)) { + IREE_RETURN_IF_ERROR(iree_hal_vulkan_tracing_context_allocate( + device->physical_device, device->logical_device, queue->handle(), + queue_name, maintenance_dispatch_queue, device->dispatch_command_pool, + device->host_allocator, + &device->queue_tracing_contexts[queue_index])); + queue->set_tracing_context(device->queue_tracing_contexts[queue_index]); + } } + + return iree_ok_status(); } static iree_status_t iree_hal_vulkan_device_create_internal( iree_hal_driver_t* driver, iree_string_view_t identifier, + iree_hal_vulkan_features_t enabled_features, const iree_hal_vulkan_device_options_t* options, VkInstance instance, VkPhysicalDevice physical_device, VkDeviceHandle* logical_device, const iree_hal_vulkan_device_extensions_t* device_extensions, @@ -474,7 +543,8 @@ static iree_status_t iree_hal_vulkan_device_create_internal( sizeof(*device) + identifier.size + total_queue_count * sizeof(device->queues[0]) + total_queue_count * sizeof(device->dispatch_queues[0]) + - total_queue_count * sizeof(device->transfer_queues[0]); + total_queue_count * sizeof(device->transfer_queues[0]) + + total_queue_count * sizeof(device->queue_tracing_contexts[0]); IREE_RETURN_IF_ERROR( iree_allocator_malloc(host_allocator, total_size, (void**)&device)); memset(device, 0, total_size); @@ -502,6 +572,9 @@ static iree_status_t iree_hal_vulkan_device_create_internal( buffer_ptr += total_queue_count * sizeof(device->dispatch_queues[0]); device->transfer_queues = (CommandQueue**)buffer_ptr; buffer_ptr += total_queue_count * sizeof(device->transfer_queues[0]); + device->queue_tracing_contexts = + (iree_hal_vulkan_tracing_context_t**)buffer_ptr; + buffer_ptr += total_queue_count * sizeof(device->queue_tracing_contexts[0]); device->descriptor_pool_cache = new DescriptorPoolCache(device->logical_device); @@ -550,8 +623,9 @@ static iree_status_t iree_hal_vulkan_device_create_internal( // initialization; this happens last as the queues require the pools allocated // above. 
if (iree_status_is_ok(status)) { - iree_hal_vulkan_device_initialize_command_queues( - device, identifier, compute_queue_set, transfer_queue_set); + status = iree_hal_vulkan_device_initialize_command_queues( + device, enabled_features, identifier, compute_queue_set, + transfer_queue_set); } if (iree_status_is_ok(status)) { @@ -570,6 +644,7 @@ static void iree_hal_vulkan_device_destroy(iree_hal_device_t* base_device) { // Drop all command queues. These may wait until idle in their destructor. for (iree_host_size_t i = 0; i < device->queue_count; ++i) { delete device->queues[i]; + iree_hal_vulkan_tracing_context_free(device->queue_tracing_contexts[i]); } // Drop command pools now that we know there are no more outstanding command @@ -715,6 +790,16 @@ iree_status_t iree_hal_vulkan_device_create( semaphore_features.timelineSemaphore = VK_TRUE; } + VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset_features; + if (enabled_device_extensions.host_query_reset) { + memset(&host_query_reset_features, 0, sizeof(host_query_reset_features)); + host_query_reset_features.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT; + host_query_reset_features.pNext = features2.pNext; + features2.pNext = &host_query_reset_features; + host_query_reset_features.hostQueryReset = VK_TRUE; + } + auto logical_device = new VkDeviceHandle( instance_syms, enabled_device_extensions, /*owns_device=*/true, host_allocator, /*allocator=*/NULL); @@ -741,9 +826,9 @@ iree_status_t iree_hal_vulkan_device_create( // Allocate and initialize the device. if (iree_status_is_ok(status)) { status = iree_hal_vulkan_device_create_internal( - driver, identifier, options, instance, physical_device, logical_device, - &enabled_device_extensions, &compute_queue_set, &transfer_queue_set, - host_allocator, out_device); + driver, identifier, enabled_features, options, instance, + physical_device, logical_device, &enabled_device_extensions, + &compute_queue_set, &transfer_queue_set, host_allocator, out_device); } logical_device->ReleaseReference(); @@ -783,6 +868,11 @@ IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_vulkan_wrap_device( iree_hal_vulkan_device_extensions_t enabled_device_extensions = iree_hal_vulkan_infer_enabled_device_extensions(device_syms.get()); + iree_hal_vulkan_features_t enabled_features = 0; +#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + enabled_features |= IREE_HAL_VULKAN_FEATURE_ENABLE_TRACING; +#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION + // Wrap the provided VkDevice with a VkDeviceHandle for use within the HAL. auto logical_device_handle = new VkDeviceHandle( device_syms.get(), enabled_device_extensions, @@ -791,9 +881,9 @@ IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_vulkan_wrap_device( // Allocate and initialize the device. 
iree_status_t status = iree_hal_vulkan_device_create_internal( - /*driver=*/NULL, identifier, options, instance, physical_device, - logical_device_handle, &enabled_device_extensions, compute_queue_set, - transfer_queue_set, host_allocator, out_device); + /*driver=*/NULL, identifier, enabled_features, options, instance, + physical_device, logical_device_handle, &enabled_device_extensions, + compute_queue_set, transfer_queue_set, host_allocator, out_device); logical_device_handle->ReleaseReference(); return status; @@ -851,9 +941,18 @@ static iree_status_t iree_hal_vulkan_device_create_command_buffer( command_pool = device->dispatch_command_pool; } + // The tracing context is tied to a particular queue so we must select here + // even though ideally we'd do it during submission. This is informational + // only and if the user does provide a different queue affinity during + // submission it just means the commands will be attributed to the wrong + // queue. + CommandQueue* queue = iree_hal_vulkan_device_select_queue( + device, command_categories, queue_affinity); + return iree_hal_vulkan_direct_command_buffer_allocate( device->logical_device, command_pool, mode, command_categories, - queue_affinity, device->descriptor_pool_cache, out_command_buffer); + queue_affinity, queue->tracing_context(), device->descriptor_pool_cache, + out_command_buffer); } static iree_status_t iree_hal_vulkan_device_create_descriptor_set( @@ -956,13 +1055,6 @@ static iree_status_t iree_hal_vulkan_device_wait_semaphores_with_timeout( static iree_status_t iree_hal_vulkan_device_wait_idle_with_deadline( iree_hal_device_t* base_device, iree_time_t deadline_ns) { iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device); - if (deadline_ns == IREE_TIME_INFINITE_FUTURE) { - // Fast path for using vkDeviceWaitIdle, which is usually cheaper (as it - // requires fewer calls into the driver). - return VK_RESULT_TO_STATUS(device->logical_device->syms()->vkDeviceWaitIdle( - *device->logical_device), - "vkDeviceWaitIdle"); - } for (iree_host_size_t i = 0; i < device->queue_count; ++i) { IREE_RETURN_IF_ERROR(device->queues[i]->WaitIdle(deadline_ns)); }
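As a closing illustration of the query plumbing these changes rely on (and of why VK_EXT_host_query_reset is requested as an optional device extension above), the sketch below shows the raw Vulkan mechanism behind a pull-style timestamp collection. It is not the tracing.cc implementation: the helper name, the fixed 64-entry scratch buffer, the use of VK_QUERY_RESULT_WAIT_BIT for brevity, and the statically resolved vkResetQueryPoolEXT entry point are all simplifying assumptions.

    // Sketch only: read back a range of timestamp queries and recycle them.
    static void collect_timestamp_batch(
        VkDevice device, VkPhysicalDevice physical_device,
        VkQueryPool query_pool, uint32_t first_query, uint32_t query_count,
        bool has_host_query_reset,
        VkCommandBuffer maintenance_command_buffer) {
      // Pull the raw GPU ticks for the completed queries (assumes
      // query_count <= 64 to keep the example small).
      uint64_t ticks[64];
      vkGetQueryPoolResults(device, query_pool, first_query, query_count,
                            sizeof(ticks), ticks, sizeof(uint64_t),
                            VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);

      // Convert ticks to nanoseconds with the device timestamp period before
      // handing them to the profiler.
      VkPhysicalDeviceProperties properties;
      vkGetPhysicalDeviceProperties(physical_device, &properties);
      double start_ns = (double)ticks[0] * properties.limits.timestampPeriod;
      (void)start_ns;  // A real collector would feed this to tracy here.

      // Recycle the consumed query range. With VK_EXT_host_query_reset this is
      // a cheap host-side call; otherwise a device-side reset must be recorded
      // into a command buffer and submitted on a graphics/compute queue.
      if (has_host_query_reset) {
        vkResetQueryPoolEXT(device, query_pool, first_query, query_count);
      } else {
        vkCmdResetQueryPool(maintenance_command_buffer, query_pool, first_query,
                            query_count);
      }
    }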