Skip to content

Commit

Permalink
used nvidia-smi to find free gpus (unstable)
Browse files Browse the repository at this point in the history
  • Loading branch information
justanhduc committed Nov 13, 2020
1 parent 9ea170e commit a52058b
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 24 deletions.
8 changes: 1 addition & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@ project(Task-Spooler C)

set(CMAKE_C_STANDARD 11)

set(CMAKE_CUDA_COMPILER $ENV{CUDA_HOME}/bin/nvcc)
find_package(CUDA REQUIRED)
include_directories($ENV{CUDA_HOME}/include)

add_executable(
ts
client.c
Expand All @@ -28,13 +24,11 @@ add_executable(
tail.c
)

target_link_libraries(ts ${CUDA_LIBRARIES})

set(
TS_PERMISSIONS
OWNER_WRITE OWNER_READ OWNER_EXECUTE
GROUP_READ GROUP_EXECUTE
WORLD_READ WORLD_EXECUTE
)

install(TARGETS ts DESTINATION /usr/local/bin)
install(TARGETS ts DESTINATION /usr/local/bin PERMISSIONS ${TS_PERMISSIONS})
12 changes: 10 additions & 2 deletions client.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <signal.h>
#include "main.h"

/* Free-memory ratio threshold handed to getFreeGpuList() as `thres`: a GPU is
 * listed as free when its free/total memory ratio is at least this value. */
float gpuFreeThres = .9;

static void c_end_of_job(const struct Result *res);
static void c_wait_job_send();
static void c_wait_running_job_send();
Expand Down Expand Up @@ -140,12 +142,18 @@ int c_wait_server_commands()
struct Result result;
if (command_line.gpus) {
int numFree;
int * freeList = getFreeGpuList(&numFree);
int * freeList = getFreeGpuList(&numFree, gpuFreeThres);
char tmp[50];
strcpy(tmp, "CUDA_VISIBLE_DEVICES=");
for (int i = 0; i < command_line.gpus; i++) {
char tmp2[5];
sprintf(tmp2, "%d", freeList[i]);
int gpu = freeList[i];
if (gpu == -1) {
error("Wrong GPU ID");
exit(-1);
}

sprintf(tmp2, "%d", gpu);
strcat(tmp, tmp2);
if (i < command_line.gpus - 1)
strcat(tmp, ",");
Expand Down
77 changes: 64 additions & 13 deletions gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,77 @@
//

#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

int * getFreeGpuList(int *numFree) {
#include "main.h"

/*
 * Return the num-th (1-based) field of `line`, splitting on any character of
 * `delimiter` or on a newline.
 *
 * line:      buffer to split; it is modified in place by strtok().
 * num:       1-based index of the wanted field.
 * delimiter: set of separator characters, of any length.
 *
 * Returns a pointer into `line` at the start of the field, or NULL when the
 * line has fewer than `num` fields, when num < 1, when `line` or `delimiter`
 * is NULL, or when memory for the separator set cannot be allocated.
 *
 * Uses strtok(), so it is not reentrant and runs of separators are collapsed.
 */
char* getfield(char* line, int num, char *delimiter)
{
    char *tok;
    char *found = NULL;
    char *separators;
    size_t delimLen;

    if (line == NULL || delimiter == NULL)
        return NULL;

    /* Size the separator set from the delimiter itself: delimiter + '\n' +
     * terminating NUL. A fixed small buffer overflows for longer delimiters. */
    delimLen = strlen(delimiter);
    separators = (char *) malloc(delimLen + 2);
    if (separators == NULL)
        return NULL;
    memcpy(separators, delimiter, delimLen);
    separators[delimLen] = '\n';
    separators[delimLen + 1] = '\0';

    /* Use the same separator set for every call so the trailing newline is
     * stripped from the first field as well as from the later ones. */
    for (tok = strtok(line, separators);
         tok && *tok;
         tok = strtok(NULL, separators))
    {
        if (!--num) {
            found = tok;
            break;
        }
    }

    free(separators);
    return found;
}

/*
 * Return the indices of the GPUs that currently count as free.
 *
 * Runs `nvidia-smi --query-gpu=memory.free,memory.total --format=csv` and
 * reads its CSV output. The first line is treated as a header and skipped;
 * each following line is taken as one GPU, numbered from 0 in output order.
 * A GPU is listed when its free/total memory ratio is >= thres.
 *
 * numFree: out parameter; receives the number of GPU indices stored.
 * thres:   minimum free/total memory ratio for a GPU to be listed.
 *
 * Returns a malloc'ed array of 100 ints: the first *numFree entries are the
 * free GPU indices and all remaining entries are -1. The caller owns the
 * array and must free() it. Returns NULL (with *numFree == 0) when the array
 * cannot be allocated.
 */
int * getFreeGpuList(int *numFree, float thres) {
    enum { MAX_GPUS = 100 };  /* capacity of the returned list */
    int *gpuList;
    int numFound = 0;
    int gpuIdx = 0;
    int headerPending = 1;
    int i;
    FILE *stream;
    char line[1024];

    *numFree = 0;

    gpuList = (int *) malloc(MAX_GPUS * sizeof(int));
    if (gpuList == NULL) {
        error("Cannot allocate GPU list");
        return NULL;
    }
    /* Fill element by element: memset() takes a byte count, so it would not
     * cover the whole int array when given the element count. */
    for (i = 0; i < MAX_GPUS; i++)
        gpuList[i] = -1;

    /* Read the tool's output through a pipe. This needs no shared temp file
     * (which concurrent processes would race on) and leaves this process's
     * own stdout untouched. */
    stream = popen("nvidia-smi --query-gpu=memory.free,memory.total --format=csv", "r");
    if (stream == NULL) {
        error("Cannot exec nvidia-smi");
        /* error() is declared elsewhere; if it returns, report no free GPUs. */
        return gpuList;
    }

    while (fgets(line, sizeof(line), stream))
    {
        char *comma;

        if (headerPending) {
            headerPending = 0;
            continue;
        }

        /* Each data line looks like "<free> MiB, <total> MiB"; strtol() skips
         * leading blanks and stops at the first non-digit. */
        comma = strchr(line, ',');
        if (comma != NULL) {
            long freeMem = strtol(line, NULL, 10);
            long totalMem = strtol(comma + 1, NULL, 10);

            /* Skip unparsable totals (avoids dividing by zero) and never
             * write past the end of the list. */
            if (totalMem > 0 && numFound < MAX_GPUS &&
                ((float) freeMem / (float) totalMem) >= thres) {
                gpuList[numFound] = gpuIdx;
                numFound++;
            }
        }
        gpuIdx++;
    }

    /* A non-zero status means the command could not be run or it failed. */
    if (pclose(stream) != 0)
        error("Cannot exec nvidia-smi");

    *numFree = numFound;
    return gpuList;
}
2 changes: 1 addition & 1 deletion jobs.c
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@ int next_run_job()
if (p->gpus) {
int numFree;
/* get number of free GPUs at the moment */
getFreeGpuList(&numFree);
getFreeGpuList(&numFree, gpuFreeThres);

/* GPU mem takes some time to be allocated
* if there are many processes in queue,
Expand Down
5 changes: 4 additions & 1 deletion main.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
Please find the license in the provided COPYING file.
*/
#include <sys/time.h>

enum
{
CMD_LEN=500,
Expand Down Expand Up @@ -114,6 +116,7 @@ extern struct Command_line command_line;
extern int server_socket;
extern enum Process_type process_type;
extern int server_socket; /* Used in the client */
extern float gpuFreeThres;

struct Msg;

Expand Down Expand Up @@ -360,4 +363,4 @@ char * get_environment();
int tail_file(const char *fname, int last_lines);

/* gpu.c */
int * getFreeGpuList(int *numFree);
int * getFreeGpuList(int *numFree, float thres);

0 comments on commit a52058b

Please sign in to comment.