From b30ab07139e62a29c9b15eaa82919b98e97e2374 Mon Sep 17 00:00:00 2001 From: Duc Nguyen Date: Wed, 11 Nov 2020 14:24:35 +0900 Subject: [PATCH] workaround for gpu mem allocation time problem --- jobs.c | 43 ++++++++++++++++++++++++++++++++++++++----- list.c | 2 +- main.h | 1 + 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/jobs.c b/jobs.c index 0f5c84e..c28af08 100644 --- a/jobs.c +++ b/jobs.c @@ -194,6 +194,23 @@ void s_count_running_jobs(int s) send_msg(s, &m); } +int s_count_allocating_jobs() +{ + int count = 0; + struct Job *p; + + /* Count jobs in the ALLOCATING state */ + p = firstjob; + while(p != 0) + { + if (p->state == ALLOCATING) + ++count; + + p = p->next; + } + return count; +} + void s_get_label(int s, int jobid) { struct Job *p = 0; @@ -255,7 +272,7 @@ int wake_hold_client() p = findjob_holding_client(); if (p) { - p->state = QUEUED; + p->state = (p->gpus) ? ALLOCATING : QUEUED; return p->jobid; } return -1; @@ -269,6 +286,9 @@ const char * jstate2string(enum Jobstate s) case QUEUED: jobstate = "queued"; break; + case ALLOCATING: + jobstate = "allocating"; + break; case RUNNING: jobstate = "running"; break; @@ -389,11 +409,11 @@ int s_newjob(int s, struct msg *m) p = newjobptr(); p->jobid = jobids++; + p->gpus = m->u.newjob.gpus; if (count_not_finished_jobs() < max_jobs) - p->state = QUEUED; + p->state = (p->gpus) ? 
ALLOCATING : QUEUED; else p->state = HOLDING_CLIENT; - p->gpus = m->u.newjob.gpus; p->num_slots = m->u.newjob.num_slots; p->store_output = m->u.newjob.store_output; p->should_keep_finished = m->u.newjob.should_keep_finished; @@ -591,12 +611,24 @@ int next_run_job() p = firstjob; while(p != 0) { - if (p->state == QUEUED) + if (p->state == QUEUED || p->state == ALLOCATING) { if (p->gpus) { int numFree; /* get number of free GPUs at the moment */ getFreeGpuList(&numFree); + + if (numFree > 0) { + /* GPU memory takes some time to be allocated; + * if many jobs are queued, several of them + * can end up using the same GPU. + * TODO: ugly - this sleep stalls the loop for 60 s */ + sleep(60); + } else { + p = p->next; + continue; + } + if (numFree < p->gpus) { /* if fewer GPUs than required then next */ p = p->next; @@ -610,7 +642,8 @@ int next_run_job() /* We won't try to run any job do_depending on an unfinished * job */ if (do_depend_job != NULL && - (do_depend_job->state == QUEUED || do_depend_job->state == RUNNING)) + (do_depend_job->state == QUEUED || do_depend_job->state == RUNNING || + do_depend_job->state == ALLOCATING)) { /* Next try */ p = p->next; diff --git a/list.c b/list.c index 19daadb..0b0ee59 100644 --- a/list.c +++ b/list.c @@ -65,7 +65,7 @@ static const char * ofilename_shown(const struct Job *p) output_filename = "(no output)"; } else if (p->store_output) { - if (p->state == QUEUED) + if (p->state == QUEUED || p->state == ALLOCATING) { output_filename = "(file)"; } else diff --git a/main.h b/main.h index 3c0cf44..569b62d 100644 --- a/main.h +++ b/main.h @@ -113,6 +113,7 @@ struct msg; enum Jobstate { QUEUED, + ALLOCATING, RUNNING, FINISHED, SKIPPED,