From bd9a113a8de10d2e4367513ddec198aa86ecf9ed Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Wed, 31 Mar 2021 14:38:29 -0700
Subject: [PATCH] Adding command buffer queue affinity. (#5265)

This is needed for the CPU backend and Vulkan tracing. It's a somewhat
unfortunate dependency: ideally we wouldn't need to track the queue
during recording. However, for one-shot recordings we know at compile
time which queue a stream will be submitted to. We can always change
the behavior to ignore the affinity (or treat it as a suggestion) when
recording reusable command buffers.
---
 iree/hal/command_buffer.c                 |  3 +-
 iree/hal/command_buffer.h                 | 21 +++++++++++
 iree/hal/cts/command_buffer_test.cc       | 18 ++++++---
 iree/hal/cts/event_test.cc                |  9 +++--
 iree/hal/cts/semaphore_submission_test.cc |  9 +++--
 iree/hal/cuda/cuda_device.c               |  9 +++--
 iree/hal/cuda/graph_command_buffer.c      |  3 ++
 iree/hal/cuda/graph_command_buffer.h      |  1 +
 iree/hal/device.c                         |  2 +-
 iree/hal/device.h                         |  5 ++-
 iree/hal/local/task_command_buffer.c      |  3 ++
 iree/hal/local/task_command_buffer.h      |  1 +
 iree/hal/local/task_device.c              | 46 ++++++++++++-----------
 iree/hal/vulkan/direct_command_buffer.cc  |  3 ++
 iree/hal/vulkan/direct_command_buffer.h   |  1 +
 iree/hal/vulkan/vulkan_device.cc          | 37 +++++++++---------
 iree/modules/hal/hal_module.c             |  3 +-
 17 files changed, 115 insertions(+), 59 deletions(-)

diff --git a/iree/hal/command_buffer.c b/iree/hal/command_buffer.c
index e412480582ae..22814b25ac1c 100644
--- a/iree/hal/command_buffer.c
+++ b/iree/hal/command_buffer.c
@@ -26,6 +26,7 @@ IREE_HAL_API_RETAIN_RELEASE(command_buffer);
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_command_buffer_create(
     iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
@@ -33,7 +34,7 @@ IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_command_buffer_create(
   IREE_TRACE_ZONE_BEGIN(z0);
   iree_status_t status =
       IREE_HAL_VTABLE_DISPATCH(device, iree_hal_device, create_command_buffer)(
-          device, mode, command_categories, out_command_buffer);
+          device, mode, command_categories, queue_affinity, out_command_buffer);
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
diff --git a/iree/hal/command_buffer.h b/iree/hal/command_buffer.h
index 6031b2ace031..044e4b8c6dea 100644
--- a/iree/hal/command_buffer.h
+++ b/iree/hal/command_buffer.h
@@ -64,6 +64,22 @@ enum iree_hal_command_category_e {
 };
 typedef uint32_t iree_hal_command_category_t;
 
+// A bitmask indicating affinity for a submission to use a particular set of
+// queues.
+//
+// Upon submission the queue is selected based on the flags set in
+// |command_categories| and the |queue_affinity|. As the number of available
+// queues can vary, the |queue_affinity| is used to hash into the available
+// queues for the required categories. For example, if 2 queues support
+// transfer commands and the affinity is 5, the resulting queue could be index
+// hash(5)=1. The affinity can thus be treated as just a way to indicate
+// whether two submissions must be placed onto the same queue. Note that the
+// exact hashing function is implementation-dependent.
+typedef uint64_t iree_hal_queue_affinity_t;
+
+// Specifies that any queue may be selected.
+#define IREE_HAL_QUEUE_AFFINITY_ANY ((iree_hal_queue_affinity_t)(-1))
+
 // Bitfield specifying which execution stage a barrier should start/end at.
 //
 // Maps to VkPipelineStageFlagBits.
@@ -193,9 +209,14 @@ typedef struct iree_hal_command_buffer_s iree_hal_command_buffer_t;
 
 // Creates a command buffer ready to begin recording, possibly reusing an
 // existing one from the |device| pool.
+//
+// |queue_affinity| specifies the device queues the command buffer may be
+// submitted to. The queue affinity provided to iree_hal_device_queue_submit
+// must match or be a subset of the |queue_affinity|.
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_command_buffer_create(
     iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer);
 
 // Retains the given |command_buffer| for the caller.
diff --git a/iree/hal/cts/command_buffer_test.cc b/iree/hal/cts/command_buffer_test.cc
index 4d526ee93830..fc9da82b583d 100644
--- a/iree/hal/cts/command_buffer_test.cc
+++ b/iree/hal/cts/command_buffer_test.cc
@@ -41,7 +41,8 @@ TEST_P(CommandBufferTest, Create) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   EXPECT_TRUE((iree_hal_command_buffer_allowed_categories(command_buffer) &
                IREE_HAL_COMMAND_CATEGORY_DISPATCH) ==
@@ -54,7 +55,8 @@ TEST_P(CommandBufferTest, BeginEnd) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
@@ -66,7 +68,8 @@ TEST_P(CommandBufferTest, SubmitEmpty) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
@@ -81,7 +84,8 @@ TEST_P(CommandBufferTest, FillBufferWithRepeatedBytes) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_TRANSFER, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   iree_hal_buffer_t* device_buffer;
   IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
@@ -141,7 +145,8 @@ TEST_P(CommandBufferTest, CopyWholeBuffer) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_TRANSFER, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   // Create and fill a host buffer.
   iree_hal_buffer_t* host_buffer;
@@ -192,7 +197,8 @@ TEST_P(CommandBufferTest, CopySubBuffer) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_TRANSFER, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   iree_hal_buffer_t* device_buffer;
   IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
diff --git a/iree/hal/cts/event_test.cc b/iree/hal/cts/event_test.cc
index b7282773c55f..72ef2b24c757 100644
--- a/iree/hal/cts/event_test.cc
+++ b/iree/hal/cts/event_test.cc
@@ -36,7 +36,8 @@ TEST_P(EventTest, SignalAndReset) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_signal_event(
@@ -60,10 +61,12 @@ TEST_P(EventTest, SubmitWithChainedCommandBuffers) {
   iree_hal_command_buffer_t* command_buffer_2;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer_1));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer_1));
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer_2));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer_2));
 
   // First command buffer signals the event when it completes.
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer_1));
diff --git a/iree/hal/cts/semaphore_submission_test.cc b/iree/hal/cts/semaphore_submission_test.cc
index 5cec41f38b3d..336b3619077d 100644
--- a/iree/hal/cts/semaphore_submission_test.cc
+++ b/iree/hal/cts/semaphore_submission_test.cc
@@ -57,7 +57,8 @@ TEST_P(SemaphoreSubmissionTest, SubmitAndSignal) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
@@ -94,7 +95,8 @@ TEST_P(SemaphoreSubmissionTest, SubmitWithWait) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
@@ -142,7 +144,8 @@ TEST_P(SemaphoreSubmissionTest, SubmitWithMultipleSemaphores) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
       device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_DISPATCH, &command_buffer));
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
diff --git a/iree/hal/cuda/cuda_device.c b/iree/hal/cuda/cuda_device.c
index 97c771b90ddb..c529999e723d 100644
--- a/iree/hal/cuda/cuda_device.c
+++ b/iree/hal/cuda/cuda_device.c
@@ -155,10 +155,12 @@ static iree_hal_allocator_t* iree_hal_cuda_device_allocator(
 static iree_status_t iree_hal_cuda_device_create_command_buffer(
     iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
   return iree_hal_cuda_graph_command_buffer_allocate(
-      &device->context_wrapper, mode, command_categories, out_command_buffer);
+      &device->context_wrapper, mode, command_categories, queue_affinity,
+      out_command_buffer);
 }
 
 static iree_status_t iree_hal_cuda_device_create_descriptor_set(
@@ -218,8 +220,9 @@ static iree_status_t iree_hal_cuda_device_create_semaphore(
 
 static iree_status_t iree_hal_cuda_device_queue_submit(
     iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories, uint64_t queue_affinity,
-    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
   iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
   for (int i = 0; i < batch_count; i++) {
     for (int j = 0; j < batches[i].command_buffer_count; j++) {
diff --git a/iree/hal/cuda/graph_command_buffer.c b/iree/hal/cuda/graph_command_buffer.c
index bbc5db7cd0ab..7fa0822c42e3 100644
--- a/iree/hal/cuda/graph_command_buffer.c
+++ b/iree/hal/cuda/graph_command_buffer.c
@@ -28,6 +28,7 @@ typedef struct {
   iree_hal_cuda_context_wrapper_t* context;
   iree_hal_command_buffer_mode_t mode;
   iree_hal_command_category_t allowed_categories;
+  iree_hal_queue_affinity_t queue_affinity;
   CUgraph graph;
   CUgraphExec exec;
   // Keep track of the last node added to the command buffer as we are currently
@@ -52,6 +53,7 @@ iree_status_t iree_hal_cuda_graph_command_buffer_allocate(
     iree_hal_cuda_context_wrapper_t* context,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer) {
   IREE_ASSERT_ARGUMENT(context);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
@@ -72,6 +74,7 @@ iree_status_t iree_hal_cuda_graph_command_buffer_allocate(
   command_buffer->context = context;
   command_buffer->mode = mode;
   command_buffer->allowed_categories = command_categories;
+  command_buffer->queue_affinity = queue_affinity;
   command_buffer->graph = graph;
   command_buffer->exec = NULL;
   command_buffer->last_node = NULL;
diff --git a/iree/hal/cuda/graph_command_buffer.h b/iree/hal/cuda/graph_command_buffer.h
index 0edac78824d2..c9f6ccf61d16 100644
--- a/iree/hal/cuda/graph_command_buffer.h
+++ b/iree/hal/cuda/graph_command_buffer.h
@@ -29,6 +29,7 @@ iree_status_t iree_hal_cuda_graph_command_buffer_allocate(
     iree_hal_cuda_context_wrapper_t* context,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer);
 
 // Returns the native CUDA graph associated with the command buffer.
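Aside: the affinity-to-queue contract documented in command_buffer.h above is easiest to see in a standalone sketch. The following is illustrative only and not part of the patch — the helper name select_queue_index and the queue counts are assumptions; real backends map into their own queue pools (compare iree_hal_task_device_select_queue and iree_hal_vulkan_device_select_queue below). The point is that equal affinities always resolve to the same index, with a plain modulo standing in for whatever implementation-dependent hash a backend chooses.

// Minimal sketch of the affinity -> queue-index mapping (hypothetical helper).
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t iree_hal_queue_affinity_t;  // mirrors the typedef added above

// Equal affinities map to equal indices for a given queue count; the exact
// "hash" (here a modulo) is implementation-dependent.
static size_t select_queue_index(iree_hal_queue_affinity_t queue_affinity,
                                 size_t queue_count) {
  return (size_t)(queue_affinity % queue_count);
}

int main(void) {
  // With 2 transfer-capable queues, affinity 5 selects index 5 % 2 = 1,
  // matching the hash(5)=1 example in the header comment.
  assert(select_queue_index(5, 2) == 1);
  // Two submissions with the same affinity land on the same queue.
  assert(select_queue_index(7, 3) == select_queue_index(7, 3));
  return 0;
}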
diff --git a/iree/hal/device.c b/iree/hal/device.c
index 5aeddec8b119..a626cc85fd4d 100644
--- a/iree/hal/device.c
+++ b/iree/hal/device.c
@@ -42,7 +42,7 @@ iree_hal_device_allocator(iree_hal_device_t* device) {
 
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_device_queue_submit(
     iree_hal_device_t* device, iree_hal_command_category_t command_categories,
-    uint64_t queue_affinity, iree_host_size_t batch_count,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
     const iree_hal_submission_batch_t* batches) {
   IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(!batch_count || batches);
diff --git a/iree/hal/device.h b/iree/hal/device.h
index 2357951365a9..2bb6de2331dd 100644
--- a/iree/hal/device.h
+++ b/iree/hal/device.h
@@ -171,7 +171,7 @@ iree_hal_device_allocator(iree_hal_device_t* device);
 // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/vkQueueSubmit.html
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_hal_device_queue_submit(
     iree_hal_device_t* device, iree_hal_command_category_t command_categories,
-    uint64_t queue_affinity, iree_host_size_t batch_count,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
     const iree_hal_submission_batch_t* batches);
 
 // Blocks the caller until the semaphores reach or exceed the specified payload
@@ -242,6 +242,7 @@ typedef struct {
   iree_status_t(IREE_API_PTR* create_command_buffer)(
       iree_hal_device_t* device, iree_hal_command_buffer_mode_t mode,
       iree_hal_command_category_t command_categories,
+      iree_hal_queue_affinity_t queue_affinity,
      iree_hal_command_buffer_t** out_command_buffer);
 
   iree_status_t(IREE_API_PTR* create_descriptor_set)(
@@ -276,7 +277,7 @@ typedef struct {
   iree_status_t(IREE_API_PTR* queue_submit)(
       iree_hal_device_t* device,
       iree_hal_command_category_t command_categories,
-      uint64_t queue_affinity, iree_host_size_t batch_count,
+      iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
       const iree_hal_submission_batch_t* batches);
 
   iree_status_t(IREE_API_PTR* wait_semaphores_with_deadline)(
diff --git a/iree/hal/local/task_command_buffer.c b/iree/hal/local/task_command_buffer.c
index 9d84fb37e4ae..15d0b1fd7fad 100644
--- a/iree/hal/local/task_command_buffer.c
+++ b/iree/hal/local/task_command_buffer.c
@@ -42,6 +42,7 @@ typedef struct {
   iree_task_scope_t* scope;
   iree_hal_command_buffer_mode_t mode;
   iree_hal_command_category_t allowed_categories;
+  iree_hal_queue_affinity_t queue_affinity;
 
   // Arena used for all allocations; references the shared device block pool.
   iree_arena_allocator_t arena;
@@ -108,6 +109,7 @@ iree_status_t iree_hal_task_command_buffer_create(
     iree_hal_device_t* device, iree_task_scope_t* scope,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_arena_block_pool_t* block_pool,
     iree_hal_command_buffer_t** out_command_buffer) {
   IREE_ASSERT_ARGUMENT(device);
@@ -140,6 +142,7 @@ iree_status_t iree_hal_task_command_buffer_create(
   command_buffer->scope = scope;
   command_buffer->mode = mode;
   command_buffer->allowed_categories = command_categories;
+  command_buffer->queue_affinity = queue_affinity;
   iree_arena_initialize(block_pool, &command_buffer->arena);
   iree_task_list_initialize(&command_buffer->root_tasks);
   iree_task_list_initialize(&command_buffer->leaf_tasks);
diff --git a/iree/hal/local/task_command_buffer.h b/iree/hal/local/task_command_buffer.h
index ece73886094c..44c0ccc8c27b 100644
--- a/iree/hal/local/task_command_buffer.h
+++ b/iree/hal/local/task_command_buffer.h
@@ -29,6 +29,7 @@ iree_status_t iree_hal_task_command_buffer_create(
     iree_hal_device_t* device, iree_task_scope_t* scope,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_arena_block_pool_t* block_pool,
     iree_hal_command_buffer_t** out_command_buffer);
 
diff --git a/iree/hal/local/task_device.c b/iree/hal/local/task_device.c
index 0e10e965928d..b85254d69033 100644
--- a/iree/hal/local/task_device.c
+++ b/iree/hal/local/task_device.c
@@ -197,17 +197,31 @@ static iree_hal_allocator_t* iree_hal_task_device_allocator(
   return device->device_allocator;
 }
 
+// Returns the queue index to submit work to based on the |queue_affinity|.
+//
+// If we wanted to have dedicated transfer queues we'd fork off based on
+// command_categories. For now all queues are general purpose.
+static iree_host_size_t iree_hal_task_device_select_queue(
+    iree_hal_task_device_t* device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity) {
+  // TODO(benvanik): evaluate if we want to obscure this mapping a bit so that
+  // affinity really means "equivalent affinities map to equivalent queues" and
+  // not a specific queue index.
+  return queue_affinity % device->queue_count;
+}
+
 static iree_status_t iree_hal_task_device_create_command_buffer(
     iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
-  // TODO(benvanik): prevent the need for taking a scope here. We need it to
-  // construct the tasks as we record but unfortunately then that means we would
-  // need to know which queue we'd be submitting against ahead of time.
+  iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+      device, command_categories, queue_affinity);
   return iree_hal_task_command_buffer_create(
-      base_device, &device->queues[0].scope, mode, command_categories,
-      &device->large_block_pool, out_command_buffer);
+      base_device, &device->queues[queue_index].scope, mode, command_categories,
+      queue_affinity, &device->large_block_pool, out_command_buffer);
 }
 
 static iree_status_t iree_hal_task_device_create_descriptor_set(
@@ -264,26 +278,14 @@ static iree_status_t iree_hal_task_device_create_semaphore(
       device->host_allocator, out_semaphore);
 }
 
-// Returns the queue index to submit work to based on the |queue_affinity|.
-//
-// If we wanted to have dedicated transfer queues we'd fork off based on
-// command_categories. For now all queues are general purpose.
-static iree_host_size_t iree_hal_device_select_queue(
-    iree_hal_task_device_t* device,
-    iree_hal_command_category_t command_categories, uint64_t queue_affinity) {
-  // TODO(benvanik): evaluate if we want to obscure this mapping a bit so that
-  // affinity really means "equivalent affinities map to equivalent queues" and
-  // not a specific queue index.
-  return queue_affinity % device->queue_count;
-}
-
 static iree_status_t iree_hal_task_device_queue_submit(
     iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories, uint64_t queue_affinity,
-    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
-  iree_host_size_t queue_index =
-      iree_hal_device_select_queue(device, command_categories, queue_affinity);
+  iree_host_size_t queue_index = iree_hal_task_device_select_queue(
+      device, command_categories, queue_affinity);
   return iree_hal_task_queue_submit(&device->queues[queue_index], batch_count,
                                     batches);
 }
diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc
index 47dd16fd8d84..ca16c8a1c53b 100644
--- a/iree/hal/vulkan/direct_command_buffer.cc
+++ b/iree/hal/vulkan/direct_command_buffer.cc
@@ -35,6 +35,7 @@ typedef struct {
   VkDeviceHandle* logical_device;
   iree_hal_command_buffer_mode_t mode;
   iree_hal_command_category_t allowed_categories;
+  iree_hal_queue_affinity_t queue_affinity;
 
   VkCommandPoolHandle* command_pool;
   VkCommandBuffer handle;
@@ -66,6 +67,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
     iree::hal::vulkan::VkCommandPoolHandle* command_pool,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
     iree_hal_command_buffer_t** out_command_buffer) {
   IREE_ASSERT_ARGUMENT(logical_device);
@@ -95,6 +97,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
   command_buffer->logical_device = logical_device;
   command_buffer->mode = mode;
   command_buffer->allowed_categories = command_categories;
+  command_buffer->queue_affinity = queue_affinity;
   command_buffer->command_pool = command_pool;
   command_buffer->handle = handle;
   command_buffer->syms = logical_device->syms().get();
diff --git a/iree/hal/vulkan/direct_command_buffer.h b/iree/hal/vulkan/direct_command_buffer.h
index 704609352309..858b521e97a4 100644
--- a/iree/hal/vulkan/direct_command_buffer.h
+++ b/iree/hal/vulkan/direct_command_buffer.h
@@ -29,6 +29,7 @@ iree_status_t iree_hal_vulkan_direct_command_buffer_allocate(
     iree::hal::vulkan::VkCommandPoolHandle* command_pool,
     iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
     iree_hal_command_buffer_t** out_command_buffer);
 
diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 25a3b2b2c9b0..32f14b404321 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc
@@ -817,9 +817,25 @@ static iree_hal_allocator_t* iree_hal_vulkan_device_allocator(
   return device->device_allocator;
 }
 
+// Returns the queue to submit work to based on the |queue_affinity|.
+static CommandQueue* iree_hal_vulkan_device_select_queue(
+    iree_hal_vulkan_device_t* device,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity) {
+  // TODO(benvanik): meaningful heuristics for affinity. We don't generate
+  // anything from the compiler that uses multiple queues and until we do it's
+  // best not to do anything too clever here.
+  if (command_categories == IREE_HAL_COMMAND_CATEGORY_TRANSFER) {
+    return device
+        ->transfer_queues[queue_affinity % device->transfer_queue_count];
+  }
+  return device->dispatch_queues[queue_affinity % device->dispatch_queue_count];
+}
+
 static iree_status_t iree_hal_vulkan_device_create_command_buffer(
     iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
 
@@ -837,7 +853,7 @@ static iree_status_t iree_hal_vulkan_device_create_command_buffer(
 
   return iree_hal_vulkan_direct_command_buffer_allocate(
       device->logical_device, command_pool, mode, command_categories,
-      device->descriptor_pool_cache, out_command_buffer);
+      queue_affinity, device->descriptor_pool_cache, out_command_buffer);
 }
 
 static iree_status_t iree_hal_vulkan_device_create_descriptor_set(
@@ -901,24 +917,11 @@ static iree_status_t iree_hal_vulkan_device_create_semaphore(
       initial_value, out_semaphore);
 }
 
-// Returns the queue to submit work to based on the |queue_affinity|.
-static CommandQueue* iree_hal_vulkan_device_select_queue(
-    iree_hal_vulkan_device_t* device,
-    iree_hal_command_category_t command_categories, uint64_t queue_affinity) {
-  // TODO(benvanik): meaningful heuristics for affinity. We don't generate
-  // anything from the compiler that uses multiple queues and until we do it's
-  // best not to do anything too clever here.
-  if (command_categories == IREE_HAL_COMMAND_CATEGORY_TRANSFER) {
-    return device
-        ->transfer_queues[queue_affinity % device->transfer_queue_count];
-  }
-  return device->dispatch_queues[queue_affinity % device->dispatch_queue_count];
-}
-
 static iree_status_t iree_hal_vulkan_device_queue_submit(
     iree_hal_device_t* base_device,
-    iree_hal_command_category_t command_categories, uint64_t queue_affinity,
-    iree_host_size_t batch_count, const iree_hal_submission_batch_t* batches) {
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+    const iree_hal_submission_batch_t* batches) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
   CommandQueue* queue = iree_hal_vulkan_device_select_queue(
       device, command_categories, queue_affinity);
diff --git a/iree/modules/hal/hal_module.c b/iree/modules/hal/hal_module.c
index 02f31feac9ff..59d84e459d01 100644
--- a/iree/modules/hal/hal_module.c
+++ b/iree/modules/hal/hal_module.c
@@ -498,7 +498,8 @@ IREE_VM_ABI_EXPORT(iree_hal_module_command_buffer_create, rii, r) {
 
   iree_hal_command_buffer_t* command_buffer = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
-      device, modes, command_categories, &command_buffer));
+      device, modes, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
+      &command_buffer));
   rets->r0 = iree_hal_command_buffer_move_ref(command_buffer);
   return iree_ok_status();
 }
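For reference, a usage sketch of the new API surface: record a one-shot command buffer pinned to an affinity, then submit it with the same affinity, per the new documentation in command_buffer.h that the submit affinity must match (or be a subset of) the creation affinity. This is illustrative only, not code from the patch: it assumes an existing iree_hal_device_t* handle, the usual IREE headers and <string.h> for memset, that a batch with no semaphores is valid, and that iree_hal_submission_batch_t exposes a command_buffers array alongside the command_buffer_count field the patch itself references; for brevity it also skips releasing the command buffer on early error returns.

// Hedged sketch: submit a one-shot command buffer with a matching affinity.
static iree_status_t submit_with_affinity(iree_hal_device_t* device) {
  const iree_hal_queue_affinity_t queue_affinity = 5;  // arbitrary choice

  iree_hal_command_buffer_t* command_buffer = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
      device, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
      IREE_HAL_COMMAND_CATEGORY_DISPATCH, queue_affinity, &command_buffer));

  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
  // ... record dispatch commands here ...
  IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));

  // Submit with the same affinity used at creation time so the backend's
  // select-queue helper resolves to the queue the recording targeted.
  iree_hal_submission_batch_t batch;
  memset(&batch, 0, sizeof(batch));  // no wait/signal semaphores in this sketch
  batch.command_buffer_count = 1;
  batch.command_buffers = &command_buffer;
  iree_status_t status = iree_hal_device_queue_submit(
      device, IREE_HAL_COMMAND_CATEGORY_DISPATCH, queue_affinity,
      /*batch_count=*/1, &batch);

  iree_hal_command_buffer_release(command_buffer);
  return status;
}

Passing IREE_HAL_QUEUE_AFFINITY_ANY at both creation and submission (as hal_module.c does above) remains valid when the caller has no placement preference.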