From 31f6ec52dfaaa74f6cab6302013a2a57c465541f Mon Sep 17 00:00:00 2001
From: Duc <anhduc12101992@gmail.com>
Date: Mon, 25 Oct 2021 17:26:53 +0900
Subject: [PATCH] added device visibility

---
 gpu.c  | 70 ++++++++++++++++++++++++++++++++++++++++++++--------------
 jobs.c |  6 ++---
 main.h |  3 +++
 3 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/gpu.c b/gpu.c
index 0a6ed7a..4ac3276 100755
--- a/gpu.c
+++ b/gpu.c
@@ -35,37 +35,92 @@ void initGPU() {
             error("Failed to shutdown NVML: %s", nvmlErrorString(result));
 }
 
+/*
+ * Parse TS_VISIBLE_DEVICES, a comma-separated list of GPU indices.
+ *
+ * On return *visibility is NULL or a malloc'ed array the caller must free().
+ * Returns -1 when the variable is unset (no restriction); otherwise the
+ * count reported by strtok_int(). 0 is also returned when memory could not
+ * be allocated, so the caller treats that like an empty list.
+ */
+static int getGpuVisibility(int **visibility) {
+    const char *env = getenv("TS_VISIBLE_DEVICES");
+    char *copy;
+    int num;
+
+    *visibility = NULL;
+    if (env == NULL)
+        return -1;
+
+    /* strtok_int() takes a writable buffer and presumably splits it in place
+     * (TODO confirm in main.c). Tokenize a private copy so the environment
+     * string is still intact the next time it is read. */
+    copy = malloc(strlen(env) + 1);
+    /* one slot per input character, as before, plus one so the request is
+     * never zero bytes */
+    *visibility = malloc((strlen(env) + 1) * sizeof(int));
+    if (copy == NULL || *visibility == NULL) {
+        free(copy);
+        free(*visibility);
+        *visibility = NULL;
+        return 0;
+    }
+
+    strcpy(copy, env);
+    num = strtok_int(copy, ",", *visibility);
+    free(copy);
+    return num;
+}
+
 int * getGpuList(int *num) {
-    int * gpuList;
+    int *gpuList = NULL, *visible = NULL;
     int i, count = 0;
+    int numVis, numCandidates;
     nvmlReturn_t result;
 
     result = nvmlInit();
     if (NVML_SUCCESS != result)
         error("Failed to initialize NVML: %s", nvmlErrorString(result));
 
+    numVis = getGpuVisibility(&visible);
+    if (numVis == 0) {
+        /* the visibility list is empty (or could not be read): offer nothing */
+        free(visible);
+        *num = 0;
+        goto Error;
+    }
+
-    gpuList = (int *) malloc(num_total_gpus * sizeof(int));
-    for (i = 0; i < num_total_gpus; ++i) {
-        nvmlMemory_t mem;
-        nvmlDevice_t dev;
-        result = nvmlDeviceGetHandleByIndex_v2(i, &dev);
-        if (result != 0) {
-            error("Failed to get GPU handle for GPU %d: %s", i, nvmlErrorString(result));
-            goto Error;
-        }
-
-        result = nvmlDeviceGetMemoryInfo(dev, &mem);
-        if (result != 0) {
-            error("Failed to get GPU memory for GPU %d: %s", i, nvmlErrorString(result));
-            goto Error;
-        }
-
-        if (mem.free < .9 * mem.total)
-            continue;
-
-        gpuList[count++] = i;
-    }
-    *num = count;
+    /* numVis < 0 means no visibility list: every GPU is a candidate */
+    numCandidates = (numVis < 0) ? num_total_gpus : numVis;
+    /* sized by the candidate count rather than num_total_gpus: a list that
+     * repeats an index can be longer than the number of GPUs */
+    gpuList = (int *) malloc(numCandidates * sizeof(int));
+    for (i = 0; i < numCandidates; ++i) {
+        int id = (numVis < 0) ? i : visible[i];
+        nvmlMemory_t mem;
+        nvmlDevice_t dev;
+        result = nvmlDeviceGetHandleByIndex_v2(id, &dev);
+        if (result != 0) {
+            error("Failed to get GPU handle for GPU %d: %s", id, nvmlErrorString(result));
+            free(visible);
+            goto Error;
+        }
+
+        result = nvmlDeviceGetMemoryInfo(dev, &mem);
+        if (result != 0) {
+            error("Failed to get GPU memory for GPU %d: %s", id, nvmlErrorString(result));
+            free(visible);
+            goto Error;
+        }
+
+        /* a GPU counts as free when over 90% of its memory is unused */
+        if (mem.free > .9 * mem.total)
+            gpuList[count++] = id;
+    }
+    free(visible);
+    /* one assignment after the loop covers both the filtered and the
+     * unfiltered case */
+    *num = count;
     result = nvmlShutdown();
     if (NVML_SUCCESS != result)
         error("Failed to shutdown NVML: %s", nvmlErrorString(result));
diff --git a/jobs.c b/jobs.c
index af59ec3..117ff48 100755
--- a/jobs.c
+++ b/jobs.c
@@ -681,10 +681,8 @@ int next_run_job() {
                 int *gpu_ids = (int*) malloc(p->num_gpus * sizeof(int));
                 while (i < p->num_gpus && j < numFree) {
                     /* if the prospective GPUs are in used, select the next one */
-                    if (!used_gpus[freeGpuList[j]]) {
-                        gpu_ids[i] = freeGpuList[j];
-                        i++;  /* select this GPU */
-                    }
+                    if (!used_gpus[freeGpuList[j]])
+                        gpu_ids[i++] = freeGpuList[j];  /* select this GPU and advance to the next slot */
                     j++;
                 }
                 /* some GPUs might already be claimed by other jobs, but the system still reports as free -> skip */
diff --git a/main.h b/main.h
index 8ee8a5f..cd52dbd 100755
--- a/main.h
+++ b/main.h
@@ -217,6 +217,9 @@ enum ExitCodes {
 };
 
 
+/* main.c */
+int strtok_int(char* str, char* delim, int* ids);
+
 /* client.c */
 void c_new_job();