Skip to content

Commit

Permalink
feature: events deadlock detection in validation layer
Browse files Browse the repository at this point in the history
Related-To: NEO-12810

Signed-off-by: Chandio, Bibrak Qamar <[email protected]>
  • Loading branch information
bibrak committed Nov 4, 2024
1 parent 3adf129 commit dcaf532
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 40 deletions.
62 changes: 28 additions & 34 deletions samples/zello_events_deadlock/zello_events_deadlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,14 @@ int main(int argc, char *argv[]) {
exit(1);
}

// Create an immediate command list for direct submission
ze_command_queue_desc_t altdesc = {};
altdesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
// Create a command list for direct submission
ze_command_list_desc_t altdesc = {};
altdesc.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC;

ze_command_list_handle_t command_list = {};
status = zeCommandListCreateImmediate(context, pDevice, &altdesc, &command_list);
status = zeCommandListCreate(context, pDevice, &altdesc, &command_list);
if (status != ZE_RESULT_SUCCESS) {
std::cout << "zeCommandListCreateImmediate Failed with return code: " << to_string(status) << std::endl;
std::cout << "zeCommandListCreate Failed with return code: " << to_string(status) << std::endl;
exit(1);
}

Expand All @@ -155,7 +156,7 @@ int main(int argc, char *argv[]) {
}

std::vector<ze_event_handle_t> event{};
// Two events for memcpy that will form a dependency on a 3rd event
// Three events for memcpy that will form a circular dependency.
event.resize(3);

ze_event_desc_t ev_desc = {};
Expand All @@ -178,9 +179,6 @@ int main(int argc, char *argv[]) {
ze_event_handle_t start_event;
SUCCESS_OR_TERMINATE(zeEventCreate(event_pool, &ev_desc, &start_event)); */

std::cout << std::endl
<< std::endl;

ze_host_mem_alloc_desc_t host_desc = {};
host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
host_desc.pNext = nullptr;
Expand All @@ -193,7 +191,7 @@ int main(int argc, char *argv[]) {
ze_device_mem_alloc_desc_t device_desc = {};
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
device_desc.pNext = nullptr;
device_desc.ordinal = 0;
// device_desc.ordinal = 0;
device_desc.flags = 0;

void *device_mem_ptr = nullptr;
Expand All @@ -203,20 +201,21 @@ int main(int argc, char *argv[]) {
<< std::endl;

// Action_0: Host to Device, is dependent on a future action called Action_2 (see below).
// SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 1 /* 1 */, &event[2] /* &start_event */));
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 0, nullptr));
std::cout << std::endl
<< std::endl;
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 1 /* 1 */, &event[2] /* &start_event */));
// SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[0], 0, nullptr));
/* std::cout << std::endl
<< std::endl; */

// Action_1: Host to Device, is dependent on Action_0
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[1], 1, &event[0]));
std::cout << std::endl
<< std::endl;
/* std::cout << std::endl
<< std::endl; */

// Action_2: Host to Device, is dependent on Action_1. It also creates a deadlock by having Action_0 dependent on it.
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, nullptr /* event[2] */, 1, &event[1]));
std::cout << std::endl
<< std::endl;
// SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, nullptr /* event[2] */, 1, &event[1]));
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(command_list, device_mem_ptr, host_mem_ptr, buffer_size, event[2], 1, &event[1]));
/* std::cout << std::endl
<< std::endl; */

std::cout << "\n\n\n";

Expand All @@ -225,43 +224,38 @@ int main(int argc, char *argv[]) {
ze_command_queue_desc_t command_queue_description{};
command_queue_description.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
command_queue_description.pNext = nullptr;
command_queue_description.ordinal = 0;
command_queue_description.index = 0;
// command_queue_description.ordinal = 0;
// command_queue_description.index = 0;
command_queue_description.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

ze_command_queue_handle_t command_queue{};
SUCCESS_OR_TERMINATE(zeCommandQueueCreate(context, pDevice, &command_queue_description, &command_queue));

// This segfaults. TODO!!! Fix
// Explicitly break the dependency by signaling the last event.
// zeEventHostSignal(event[2]);

SUCCESS_OR_TERMINATE(zeCommandQueueExecuteCommandLists(command_queue, 1, &command_list, nullptr));

SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(command_queue, UINT64_MAX));

// SUCCESS_OR_TERMINATE(zeEventHostSignal(start_event));

// signal the event from the device and wait for completion

// zeCommandListAppendSignalEvent(command_list, event[0]);
// zeEventHostSynchronize(event[0], UINT64_MAX);

std::cout << "Congratulations, the device completed execution!\n";

SUCCESS_OR_TERMINATE(zeCommandQueueDestroy(command_queue));

// These two hang. TODO!!! Fix
/* SUCCESS_OR_TERMINATE(zeMemFree(context, host_mem_ptr));
SUCCESS_OR_TERMINATE(zeMemFree(context, device_mem_ptr)); */
SUCCESS_OR_TERMINATE(zeMemFree(context, host_mem_ptr));
SUCCESS_OR_TERMINATE(zeMemFree(context, device_mem_ptr));

SUCCESS_OR_TERMINATE(zeEventDestroy(event[0]));
SUCCESS_OR_TERMINATE(zeEventDestroy(event[1]));
SUCCESS_OR_TERMINATE(zeEventDestroy(event[2]));
// SUCCESS_OR_TERMINATE(zeEventDestroy(start_event));

// These hang. TODO!!! Fix
/* SUCCESS_OR_TERMINATE(zeEventPoolDestroy(event_pool));
SUCCESS_OR_TERMINATE(zeEventPoolDestroy(event_pool));
SUCCESS_OR_TERMINATE(zeCommandListDestroy(command_list));

SUCCESS_OR_TERMINATE(zeContextDestroy(context));*/
SUCCESS_OR_TERMINATE(zeContextDestroy(context));

if (tracing_runtime_enabled) {
std::cout << "Disable Tracing Layer after init" << std::endl;
Expand All @@ -271,6 +265,6 @@ int main(int argc, char *argv[]) {
exit(1);
}
}
std::cout << "Returning with 0 looks like it hangs here ... ???" << std::endl;

return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListUpdateMutableComman
ze_event_handle_t hSignalEvent ///< [in][optional] handle of the event to signal on completion
) {
// TODO: Implement this
// checkForDeadlock("zeCommandListUpdateMutableCommandSignalEventExp", hSignalEvent, 0, nullptr);
checkForDeadlock("zeCommandListUpdateMutableCommandSignalEventExp", hSignalEvent, 0, nullptr);

return ZE_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -530,6 +530,16 @@ eventsDeadlockChecker::ZEeventsDeadlockChecker::zeCommandListImmediateAppendComm
void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::string zeCallDisc, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
int this_action_new_node_id = invalidDagID;

// Check if user is using invalid events, hint if it doesn't exist in eventToDagID
if (eventToDagID.find(hSignalEvent) == eventToDagID.end()) {
std::cerr << "Warning: Wait event " << hSignalEvent << " does not exist in eventToDagID map. It might be an invalid event." << std::endl;
}
for (uint32_t i = 0; i < numWaitEvents; i++) {
if (eventToDagID.find(phWaitEvents[i]) == eventToDagID.end()) {
std::cerr << "Warning: Wait event " << phWaitEvents[i] << " does not exist in eventToDagID map. It might be an invalid event." << std::endl;
}
}

if (hSignalEvent != nullptr) {
auto it = eventToDagID.find(hSignalEvent);
if (it != eventToDagID.end() && it->second != invalidDagID) {
Expand Down Expand Up @@ -592,15 +602,15 @@ void eventsDeadlockChecker::ZEeventsDeadlockChecker::checkForDeadlock(std::strin
// std::cerr << "\t\tThere is already a path from " << this_action_new_node_id << " to " << dagID << ": " << dag.Path(this_action_new_node_id, dagID, 5) << std::endl;
auto path = dag.PathDagIDs(this_action_new_node_id, dagID, 5);

std::string dependencyPrefix = "|\n\t-> ";
std::cerr << "There is already a path from:\n";
std::string spacePrefix = "";
std::cerr << "Warning: There may be a potential event deadlock! There is already a path from:\n";
auto dagIDsInPath = path.first;
std::cerr << getActionDetails(dagIDsInPath[0]) << "\n";
for (uint32_t i = 1; i < dagIDsInPath.size(); i++) {
std::cerr << dependencyPrefix << getActionDetails(dagIDsInPath[i]) << "\n";
std::cerr << spacePrefix << "|\n"
<< spacePrefix << "-> " << getActionDetails(dagIDsInPath[i]) << "\n";
spacePrefix += " ";
}

std::cerr << "\tWarning: There may be a potential event deadlock!" << std::endl;
}
} else {
std::cerr << "eventsDeadlockChecker: zeCommandListAppendMemoryCopyPrologue: Error: Wait event not found in eventToDagID map" << std::endl;
Expand Down

0 comments on commit dcaf532

Please sign in to comment.