diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac86ab..0b0d6ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,10 @@ project(Task-Spooler C) set(CMAKE_C_STANDARD 11) +set(CMAKE_CUDA_COMPILER $ENV{CUDA_HOME}/bin/nvcc) +find_package(CUDA REQUIRED) +include_directories($ENV{CUDA_HOME}/include) + add_executable( ts client.c @@ -24,6 +28,8 @@ add_executable( tail.c ) +target_link_libraries(ts ${CUDA_LIBRARIES}) + set( TS_PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE @@ -31,4 +37,4 @@ set( WORLD_READ WORLD_EXECUTE ) -install(TARGETS ts DESTINATION /usr/local/bin PERMISSIONS ${TS_PERMISSIONS}) +install(TARGETS ts DESTINATION /usr/local/bin) diff --git a/client.c b/client.c index 33d042d..fa96317 100644 --- a/client.c +++ b/client.c @@ -14,8 +14,6 @@ #include #include "main.h" -float gpuFreeThres = .9; - static void c_end_of_job(const struct Result *res); static void c_wait_job_send(); static void c_wait_running_job_send(); @@ -142,18 +140,12 @@ int c_wait_server_commands() struct Result result; if (command_line.gpus) { int numFree; - int * freeList = getFreeGpuList(&numFree, gpuFreeThres); + int * freeList = getFreeGpuList(&numFree); char tmp[50]; strcpy(tmp, "CUDA_VISIBLE_DEVICES="); for (int i = 0; i < command_line.gpus; i++) { char tmp2[5]; - int gpu = freeList[i]; - if (gpu == -1) { - error("Wrong GPU ID"); - exit(-1); - } - - sprintf(tmp2, "%d", gpu); + sprintf(tmp2, "%d", freeList[i]); strcat(tmp, tmp2); if (i < command_line.gpus - 1) strcat(tmp, ","); diff --git a/gpu.c b/gpu.c index f8101db..8637a6e 100644 --- a/gpu.c +++ b/gpu.c @@ -3,77 +3,26 @@ // #include -#include -#include -#include +#include -#include "main.h" - -char* getfield(char* line, int num, char *delimiter) -{ - char* tok; - char newDelim[3]; - sprintf(newDelim, "%s\n", delimiter); - for (tok = strtok(line, delimiter); - tok && *tok; - tok = strtok(NULL, newDelim)) - { - if (!--num) - return tok; - } - return NULL; -} - -int * getFreeGpuList(int *numFree, float thres) { +int * getFreeGpuList(int *numFree) { int * gpuList; - int j = 0, count = -1; - FILE *stream; - int fd; - int res; - char * fname = "/tmp/tmp-gpu-query"; - char line[1024]; - int stdoutDup = dup(STDOUT_FILENO); - int nDevices = 100; // just a big number - - stream = fopen(fname, "w"); - fd = fileno(stream); - dup2(fd, STDOUT_FILENO); - res = system("nvidia-smi --query-gpu=memory.free,memory.total --format=csv"); - if (res != 0) - error("Cannot exec nvidia-smi"); + int nDevices; + int i, j = 0, count = 0; - close(fd); - fclose(stream); + cudaGetDeviceCount(&nDevices); gpuList = (int *) malloc(nDevices * sizeof(int)); - memset(gpuList, -1, nDevices); - stream = fopen(fname, "r"); - while (fgets(line, 1024, stream)) - { - if (count == -1) { + for (i = 0; i < nDevices; ++i) { + cudaSetDevice(i); + size_t freeMem; + size_t totalMem; + cudaMemGetInfo(&freeMem, &totalMem); + if (freeMem > .9 * totalMem) { + gpuList[j] = i; count++; - continue; - } - - char* tmp = strdup(line); - char * freeMB = getfield(tmp, 1, ","); - tmp = strdup(line); - char * totalMB = getfield(tmp, 2, ","); - int freeMem = atoi(getfield(freeMB, 1, " ")); - int totalMem = atoi(getfield(totalMB, 1, " ")); - - if (((float) freeMem / totalMem) >= thres) { - gpuList[j] = count; j++; } - count++; - free(tmp); } - fclose(stream); - - if (!(remove(fname) == 0)) - error("Cannot remove temp GPU query file"); - - dup2(stdoutDup, STDOUT_FILENO); - *numFree = j; + *numFree = count; return gpuList; } diff --git a/jobs.c b/jobs.c index 6da87cd..ae360d8 100644 --- a/jobs.c +++ b/jobs.c @@ -637,7 +637,7 @@ int next_run_job() if (p->gpus) { int numFree; /* get number of free GPUs at the moment */ - getFreeGpuList(&numFree, gpuFreeThres); + getFreeGpuList(&numFree); /* GPU mem takes some time to be allocated * if there are many processes in queue, diff --git a/main.h b/main.h index e319447..3e98d52 100644 --- a/main.h +++ b/main.h @@ -4,8 +4,6 @@ Please find the license in the provided COPYING file. */ -#include - enum { CMD_LEN=500, @@ -116,7 +114,6 @@ extern struct Command_line command_line; extern int server_socket; extern enum Process_type process_type; extern int server_socket; /* Used in the client */ -extern float gpuFreeThres; struct Msg; @@ -363,4 +360,4 @@ char * get_environment(); int tail_file(const char *fname, int last_lines); /* gpu.c */ -int * getFreeGpuList(int *numFree, float thres); +int * getFreeGpuList(int *numFree);