Skip to content

Commit

Permalink
used NVML to query GPUs
Browse files Browse the repository at this point in the history
  • Loading branch information
justanhduc committed Nov 26, 2020
1 parent e3f53f6 commit 90d1e20
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 9 deletions.
47 changes: 38 additions & 9 deletions gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,56 @@
//

#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <nvml.h>

#include "main.h"

/*
 * Build the list of GPUs that currently look idle, using NVML.
 *
 * A device counts as free when more than 90% of its memory is reported
 * free by nvmlDeviceGetMemoryInfo().
 *
 * numFree: out parameter; receives the number of entries written to the
 *          returned array. It is set to 0 on every failure path.
 *
 * Returns a malloc()ed array of device indices that the caller must
 * free(), or NULL on failure. On success the pointer is never NULL, even
 * when no device is present, so NULL always means "error".
 *
 * NOTE(review): error() is declared outside this file (presumably via
 * main.h). The code below does not rely on whether it returns or
 * terminates the process - confirm against its definition.
 */
int * getFreeGpuList(int *numFree) {
    int *gpuList = NULL;
    unsigned int nDevices = 0;
    unsigned int i;
    int count = 0;
    nvmlReturn_t result;

    *numFree = 0;

    result = nvmlInit();
    if (NVML_SUCCESS != result) {
        error("Failed to initialize NVML: %s", nvmlErrorString(result));
        /* NVML was never initialised, so there is nothing to shut down. */
        return NULL;
    }

    result = nvmlDeviceGetCount_v2(&nDevices);
    if (NVML_SUCCESS != result) {
        error("Failed to get device count: %s", nvmlErrorString(result));
        goto Error;
    }

    /* Allocate at least one slot: malloc(0) may legally return NULL, which
     * would be indistinguishable from the error return. */
    gpuList = (int *) malloc((nDevices ? nDevices : 1) * sizeof(int));
    if (gpuList == NULL) {
        error("Failed to allocate memory for the GPU list");
        goto Error;
    }

    for (i = 0; i < nDevices; ++i) {
        nvmlMemory_t mem;
        nvmlDevice_t dev;

        result = nvmlDeviceGetHandleByIndex_v2(i, &dev);
        if (NVML_SUCCESS != result) {
            error("Failed to get GPU handle for GPU %d: %s", (int) i, nvmlErrorString(result));
            goto Error;
        }

        result = nvmlDeviceGetMemoryInfo(dev, &mem);
        if (NVML_SUCCESS != result) {
            error("Failed to get GPU memory for GPU %d: %s", (int) i, nvmlErrorString(result));
            goto Error;
        }

        /* Keep the device only when more than 90% of its memory is free. */
        if (mem.free > .9 * mem.total) {
            gpuList[count] = (int) i;
            count++;
        }
    }

    *numFree = count;
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        error("Failed to shutdown NVML: %s", nvmlErrorString(result));

    return gpuList;

Error:
    /* Release the partially filled list so the failure path does not leak. */
    free(gpuList);
    *numFree = 0;
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        error("Failed to shutdown NVML: %s", nvmlErrorString(result));
    return NULL;
}
2 changes: 2 additions & 0 deletions main.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
Please find the license in the provided COPYING file.
*/
#include <stdio.h>

enum {
CMD_LEN = 500,
PROTOCOL_VERSION = 730
Expand Down

0 comments on commit 90d1e20

Please sign in to comment.