From 31f6ec52dfaaa74f6cab6302013a2a57c465541f Mon Sep 17 00:00:00 2001 From: Duc Date: Mon, 25 Oct 2021 17:26:53 +0900 Subject: [PATCH] added device visibility --- gpu.c | 70 ++++++++++++++++++++++++++++++++++++++++++++-------------- jobs.c | 6 ++--- main.h | 3 +++ 3 files changed, 58 insertions(+), 21 deletions(-) diff --git a/gpu.c b/gpu.c index 0a6ed7a..4ac3276 100755 --- a/gpu.c +++ b/gpu.c @@ -35,37 +35,73 @@ void initGPU() { error("Failed to shutdown NVML: %s", nvmlErrorString(result)); } +static int getGpuVisibility(int **visibility) { + char* vis = getenv("TS_VISIBLE_DEVICES"); + if (vis) { + *visibility = malloc(strlen(vis) * sizeof(int)); + int num = strtok_int(vis, ",", *visibility); + return num; + } + return -1; +} + int * getGpuList(int *num) { - int * gpuList; + int *gpuList, *visible; int i, count = 0; + int numVis; nvmlReturn_t result; result = nvmlInit(); if (NVML_SUCCESS != result) error("Failed to initialize NVML: %s", nvmlErrorString(result)); + numVis = getGpuVisibility(&visible); + if (numVis == 0) { + *num = 0; + goto Error; + } + gpuList = (int *) malloc(num_total_gpus * sizeof(int)); - for (i = 0; i < num_total_gpus; ++i) { - nvmlMemory_t mem; - nvmlDevice_t dev; - result = nvmlDeviceGetHandleByIndex_v2(i, &dev); - if (result != 0) { - error("Failed to get GPU handle for GPU %d: %s", i, nvmlErrorString(result)); - goto Error; - } + if (numVis < 0) { + for (i = 0; i < num_total_gpus; ++i) { + nvmlMemory_t mem; + nvmlDevice_t dev; + result = nvmlDeviceGetHandleByIndex_v2(i, &dev); + if (result != 0) { + error("Failed to get GPU handle for GPU %d: %s", i, nvmlErrorString(result)); + goto Error; + } - result = nvmlDeviceGetMemoryInfo(dev, &mem); - if (result != 0) { - error("Failed to get GPU memory for GPU %d: %s", i, nvmlErrorString(result)); - goto Error; + result = nvmlDeviceGetMemoryInfo(dev, &mem); + if (result != 0) { + error("Failed to get GPU memory for GPU %d: %s", i, nvmlErrorString(result)); + goto Error; + } + + if (mem.free > .9 * mem.total) + gpuList[count++] = i; } + *num = count; + } else { + for (i = 0; i < numVis; i++) { + nvmlMemory_t mem; + nvmlDevice_t dev; + result = nvmlDeviceGetHandleByIndex_v2(visible[i], &dev); + if (result != 0) { + error("Failed to get GPU handle for GPU %d: %s", visible[i], nvmlErrorString(result)); + goto Error; + } - if (mem.free < .9 * mem.total) - continue; + result = nvmlDeviceGetMemoryInfo(dev, &mem); + if (result != 0) { + error("Failed to get GPU memory for GPU %d: %s", visible[i], nvmlErrorString(result)); + goto Error; + } - gpuList[count++] = i; + if (mem.free > .9 * mem.total) + gpuList[count++] = visible[i]; + } } - *num = count; result = nvmlShutdown(); if (NVML_SUCCESS != result) error("Failed to shutdown NVML: %s", nvmlErrorString(result)); diff --git a/jobs.c b/jobs.c index af59ec3..117ff48 100755 --- a/jobs.c +++ b/jobs.c @@ -681,10 +681,8 @@ int next_run_job() { int *gpu_ids = (int*) malloc(p->num_gpus * sizeof(int)); while (i < p->num_gpus && j < numFree) { /* if the prospective GPUs are in used, select the next one */ - if (!used_gpus[freeGpuList[j]]) { - gpu_ids[i] = freeGpuList[j]; - i++; /* select this GPU */ - } + if (!used_gpus[freeGpuList[j]]) + gpu_ids[i++] = freeGpuList[j]; j++; } /* some GPUs might already be claimed by other jobs, but the system still reports as free -> skip */ diff --git a/main.h b/main.h index 8ee8a5f..cd52dbd 100755 --- a/main.h +++ b/main.h @@ -217,6 +217,9 @@ enum ExitCodes { }; +/* main.c */ +int strtok_int(char* str, char* delim, int* ids); + /* client.c */ void c_new_job();