From b30ab07139e62a29c9b15eaa82919b98e97e2374 Mon Sep 17 00:00:00 2001
From: Duc Nguyen <anhduc12101992@gmail.com>
Date: Wed, 11 Nov 2020 14:24:35 +0900
Subject: [PATCH] workaround for gpu mem allocation time problem

---
 jobs.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 list.c |  2 +-
 main.h |  1 +
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/jobs.c b/jobs.c
index 0f5c84e..c28af08 100644
--- a/jobs.c
+++ b/jobs.c
@@ -194,6 +194,23 @@ void s_count_running_jobs(int s)
     send_msg(s, &m);
 }
 
+/* Return the number of jobs in the list whose state is ALLOCATING. */
+int s_count_allocating_jobs(void)
+{
+    int count = 0;
+    struct Job *p;
+
+    /* Walk the whole job list starting at firstjob; only ALLOCATING
+     * entries contribute to the total. */
+    for (p = firstjob; p != NULL; p = p->next)
+    {
+        if (p->state == ALLOCATING)
+            ++count;
+    }
+
+    return count;
+}
+
 void s_get_label(int s, int jobid)
 {
     struct Job *p = 0;
@@ -255,7 +272,7 @@ int wake_hold_client()
     p = findjob_holding_client();
     if (p)
     {
-        p->state = QUEUED;
+        p->state = (p->gpus) ? ALLOCATING : QUEUED;
         return p->jobid;
     }
     return -1;
@@ -269,6 +286,9 @@ const char * jstate2string(enum Jobstate s)
         case QUEUED:
             jobstate = "queued";
             break;
+        case ALLOCATING:
+            jobstate = "allocating";
+            break;
         case RUNNING:
             jobstate = "running";
             break;
@@ -389,11 +409,11 @@ int s_newjob(int s, struct msg *m)
     p = newjobptr();
 
     p->jobid = jobids++;
+    p->gpus = m->u.newjob.gpus;
     if (count_not_finished_jobs() < max_jobs)
-        p->state = QUEUED;
+        p->state = (p->gpus) ? ALLOCATING : QUEUED;
     else
         p->state = HOLDING_CLIENT;
-    p->gpus = m->u.newjob.gpus;
     p->num_slots = m->u.newjob.num_slots;
     p->store_output = m->u.newjob.store_output;
     p->should_keep_finished = m->u.newjob.should_keep_finished;
@@ -591,12 +611,24 @@ int next_run_job()
     p = firstjob;
     while(p != 0)
     {
-        if (p->state == QUEUED)
+        if (p->state == QUEUED || p->state == ALLOCATING)
         {
             if (p->gpus) {
                 int numFree;
                 /* get number of free GPUs at the moment */
                 getFreeGpuList(&numFree);
+
+                if (numFree > 0) {
+                    /* Workaround: GPU memory takes some time to be
+                     * allocated, so when many jobs are queued several
+                     * of them can end up using the same GPU. Wait
+                     * before going on. TODO: this is ugly */
+                    sleep(60);
+                } else {
+                    p = p->next;
+                    continue;
+                }
+
                 if (numFree < p->gpus) {
                     /* if fewer GPUs than required then next */
                     p = p->next;
@@ -610,7 +642,8 @@ int next_run_job()
                 /* We won't try to run any job do_depending on an unfinished
                  * job */
                 if (do_depend_job != NULL &&
-                    (do_depend_job->state == QUEUED || do_depend_job->state == RUNNING))
+                    (do_depend_job->state == QUEUED || do_depend_job->state == RUNNING ||
+                    do_depend_job->state == ALLOCATING))
                 {
                     /* Next try */
                     p = p->next;
diff --git a/list.c b/list.c
index 19daadb..0b0ee59 100644
--- a/list.c
+++ b/list.c
@@ -65,7 +65,7 @@ static const char * ofilename_shown(const struct Job *p)
         output_filename = "(no output)";
     } else if (p->store_output)
     {
-        if (p->state == QUEUED)
+        if (p->state == QUEUED || p->state == ALLOCATING)
         {
             output_filename = "(file)";
         } else
diff --git a/main.h b/main.h
index 3c0cf44..569b62d 100644
--- a/main.h
+++ b/main.h
@@ -113,6 +113,7 @@ struct msg;
 enum Jobstate
 {
     QUEUED,
+    ALLOCATING,
     RUNNING,
     FINISHED,
     SKIPPED,