Skip to content

Commit

Permalink
added device visibility
Browse files Browse the repository at this point in the history
  • Loading branch information
justanhduc committed Nov 4, 2021
1 parent de5fd06 commit 31f6ec5
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 21 deletions.
70 changes: 53 additions & 17 deletions gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,37 +35,73 @@ void initGPU() {
error("Failed to shutdown NVML: %s", nvmlErrorString(result));
}

/*
 * Read the TS_VISIBLE_DEVICES environment variable and turn it into a list
 * of GPU ids by handing it to strtok_int() with "," as the delimiter.
 *
 * visibility: out-parameter. On return it points to a malloc'ed int array
 *             filled by strtok_int() (the caller owns it and must free() it),
 *             or is NULL when no array was produced.
 *
 * Returns:
 *   -1  TS_VISIBLE_DEVICES is not set (*visibility is NULL);
 *    0  memory could not be allocated (*visibility is NULL), or strtok_int()
 *       reported no ids;
 *   otherwise the count reported by strtok_int().
 */
static int getGpuVisibility(int **visibility) {
    const char *vis = getenv("TS_VISIBLE_DEVICES");
    char *copy;
    size_t len;
    int num;

    /* Never leave the caller's pointer indeterminate. */
    *visibility = NULL;
    if (vis == NULL)
        return -1;

    /*
     * strtok_int() takes a non-const char *, so it may tokenize in place.
     * The string returned by getenv() must not be modified by the program,
     * so parse a private copy instead.
     */
    len = strlen(vis);
    copy = malloc(len + 1);
    if (copy == NULL)
        return 0;
    memcpy(copy, vis, len + 1);

    /*
     * One slot per character is an upper bound on the number of
     * comma-separated ids; the extra slot avoids a zero-sized allocation
     * when the variable is set but empty.
     */
    *visibility = malloc((len + 1) * sizeof **visibility);
    if (*visibility == NULL) {
        free(copy);
        return 0;
    }

    num = strtok_int(copy, ",", *visibility);
    free(copy);
    return num;
}

int * getGpuList(int *num) {
int * gpuList;
int *gpuList, *visible;
int i, count = 0;
int numVis;
nvmlReturn_t result;

result = nvmlInit();
if (NVML_SUCCESS != result)
error("Failed to initialize NVML: %s", nvmlErrorString(result));

numVis = getGpuVisibility(&visible);
if (numVis == 0) {
*num = 0;
goto Error;
}

gpuList = (int *) malloc(num_total_gpus * sizeof(int));
for (i = 0; i < num_total_gpus; ++i) {
nvmlMemory_t mem;
nvmlDevice_t dev;
result = nvmlDeviceGetHandleByIndex_v2(i, &dev);
if (result != 0) {
error("Failed to get GPU handle for GPU %d: %s", i, nvmlErrorString(result));
goto Error;
}
if (numVis < 0) {
for (i = 0; i < num_total_gpus; ++i) {
nvmlMemory_t mem;
nvmlDevice_t dev;
result = nvmlDeviceGetHandleByIndex_v2(i, &dev);
if (result != 0) {
error("Failed to get GPU handle for GPU %d: %s", i, nvmlErrorString(result));
goto Error;
}

result = nvmlDeviceGetMemoryInfo(dev, &mem);
if (result != 0) {
error("Failed to get GPU memory for GPU %d: %s", i, nvmlErrorString(result));
goto Error;
result = nvmlDeviceGetMemoryInfo(dev, &mem);
if (result != 0) {
error("Failed to get GPU memory for GPU %d: %s", i, nvmlErrorString(result));
goto Error;
}

if (mem.free > .9 * mem.total)
gpuList[count++] = i;
}
*num = count;
} else {
for (i = 0; i < numVis; i++) {
nvmlMemory_t mem;
nvmlDevice_t dev;
result = nvmlDeviceGetHandleByIndex_v2(visible[i], &dev);
if (result != 0) {
error("Failed to get GPU handle for GPU %d: %s", visible[i], nvmlErrorString(result));
goto Error;
}

if (mem.free < .9 * mem.total)
continue;
result = nvmlDeviceGetMemoryInfo(dev, &mem);
if (result != 0) {
error("Failed to get GPU memory for GPU %d: %s", visible[i], nvmlErrorString(result));
goto Error;
}

gpuList[count++] = i;
if (mem.free > .9 * mem.total)
gpuList[count++] = visible[i];
}
}
*num = count;
result = nvmlShutdown();
if (NVML_SUCCESS != result)
error("Failed to shutdown NVML: %s", nvmlErrorString(result));
Expand Down
6 changes: 2 additions & 4 deletions jobs.c
Original file line number Diff line number Diff line change
Expand Up @@ -681,10 +681,8 @@ int next_run_job() {
int *gpu_ids = (int*) malloc(p->num_gpus * sizeof(int));
while (i < p->num_gpus && j < numFree) {
/* if the prospective GPUs are in used, select the next one */
if (!used_gpus[freeGpuList[j]]) {
gpu_ids[i] = freeGpuList[j];
i++; /* select this GPU */
}
if (!used_gpus[freeGpuList[j]])
gpu_ids[i++] = freeGpuList[j];
j++;
}
/* some GPUs might already be claimed by other jobs, but the system still reports as free -> skip */
Expand Down
3 changes: 3 additions & 0 deletions main.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ enum ExitCodes {
};


/* main.c */
/*
 * NOTE(review): defined in main.c, not visible from this header. Judging by
 * its use in gpu.c it presumably splits `str` on the characters in `delim`,
 * stores each piece as an int in `ids`, and returns how many were stored --
 * confirm against the definition. `str` is non-const, so callers should
 * assume it may be modified, and `ids` must be large enough for every piece.
 */
int strtok_int(char* str, char* delim, int* ids);

/* client.c */
void c_new_job();

Expand Down

0 comments on commit 31f6ec5

Please sign in to comment.