Skip to content

Commit

Permalink
Added support for another job scheduler.
Browse files Browse the repository at this point in the history
NetworkComputer (NC), now also known as Altair Accelerator, is a commercial job scheduler from Altair/RunTime.
This change supports launching builds and runs through NC in both interactive and non-interactive modes.

Signed-off-by: Venkat Krishnan <[email protected]>
  • Loading branch information
venkatk-ot committed Oct 1, 2024
1 parent 718f28b commit 86ecf39
Show file tree
Hide file tree
Showing 4 changed files with 246 additions and 3 deletions.
4 changes: 4 additions & 0 deletions util/dvsim/LauncherFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from LocalLauncher import LocalLauncher
from LsfLauncher import LsfLauncher
from SgeLauncher import SgeLauncher
from NcLauncher import NcLauncher

try:
from edacloudlauncher.EdaCloudLauncher import EdaCloudLauncher
Expand Down Expand Up @@ -45,6 +46,9 @@ def set_launcher_type(is_local=False):
elif launcher == "sge":
_LAUNCHER_CLS = SgeLauncher

elif launcher == "nc":
_LAUNCHER_CLS = NcLauncher

# These custom launchers are site specific. They may not be committed to
# the open source repo.
elif launcher == "edacloud" and EDACLOUD_LAUNCHER_EXISTS:
Expand Down
219 changes: 219 additions & 0 deletions util/dvsim/NcLauncher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
# Copyright lowRISC contributors (OpenTitan project).
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

import datetime
import logging as log
import os
import shutil
import subprocess
from Launcher import ErrorMessage
from Launcher import Launcher
from Launcher import LauncherError
from utils import rm_path
from utils import VERBOSE


class NcLauncher(Launcher):
    """Implementation of Launcher to launch jobs using altair nc.

    Each job is wrapped in a generated `run.sh` inside the job's output
    directory and submitted with `nc run`. Jobs are stopped with `nc stop`,
    addressed by the job set name (the deploy item's full name).
    """

    def __init__(self, deploy):
        """Initialize common class members.

        Args:
            deploy: the deploy item this launcher runs; forwarded to the
                base Launcher.
        """
        super().__init__(deploy)

        # Popen object when launching the job.
        self.process = None

    def create_run_sh(self, full_path, cmd):
        """Stage the helper scripts for a job in its output directory.

        Copies `nc_post_cmd.sh` (located next to this module) into
        `full_path` and (re)creates an executable `run.sh` there which
        changes into its own directory, runs `cmd`, prints start / end
        timestamps and exits with the status of `cmd`.

        Args:
            full_path: directory in which the scripts are created.
            cmd: shell command line embedded verbatim in `run.sh`.
        """
        source_file = os.path.join(os.path.dirname(__file__), 'nc_post_cmd.sh')
        destination_file = os.path.join(full_path, 'nc_post_cmd.sh')
        shutil.copy2(source_file, destination_file)

        run_file = os.path.join(full_path, 'run.sh')
        rm_path(run_file)
        # The wrapper relies on bash-only features (`function`, BASH_SOURCE,
        # SECONDS), so it must be interpreted by bash and not by whatever
        # /bin/sh happens to be (e.g. dash).
        lines = ['#!/usr/bin/env bash',
                 'function realpath {',
                 '  python -c "import os; print (os.path.realpath(\'$1\'))"',
                 '}',
                 'MY_FILEPATH=$(realpath "${BASH_SOURCE[0]}")',
                 'MY_DIR=$( dirname "${MY_FILEPATH}" )',
                 # Quoted so that output directories with spaces still work.
                 'cd "$MY_DIR"',
                 'echo Launch start : `date`',
                 'SECONDS=0',
                 cmd,
                 # Remember the status of the real command: without this the
                 # script would always exit with the status of the last echo.
                 'LAUNCH_STATUS=$?',
                 'echo Launch end : `date`',
                 # SECONDS is elapsed wall-clock time in bash; the label is
                 # kept unchanged so existing logs stay comparable.
                 'echo CPU time : $SECONDS sec',
                 'exit $LAUNCH_STATUS']
        with open(run_file, 'w') as f:
            f.write('\n'.join(lines) + '\n')
        os.chmod(run_file, 0o755)

    def get_submit_cmd(self, interactive_flags):
        """Build the `nc run` command line for this job.

        As a side effect this stages `run.sh` and `nc_post_cmd.sh` in the
        job's output directory (see create_run_sh()).

        Args:
            interactive_flags: list of extra `nc run` options which differ
                between interactive and non-interactive launches.

        Returns:
            The command as a list of arguments suitable for subprocess.
        """
        exetool = self.deploy.sim_cfg.tool
        log_file = self.deploy.get_log_path()
        job_name = self.deploy.full_name
        cmd = self.deploy.cmd
        odir = self.deploy.odir

        # Run the post command and append what it printed to the job log.
        postcmd = (
            f'{odir}/nc_post_cmd.sh >post.log; '
            f'cat post.log >>{log_file}'
        )

        # TODO: These tool-specific names need moving into an hjson config
        # file.
        if exetool == 'xcelium':
            license_args = ['-r', 'License:Xcelium_Single_Core/1']
        elif exetool == 'vcs':
            license_args = ['-r', 'License:VCSRuntime_Net/1']
        else:
            license_args = []

        self.create_run_sh(odir, cmd)

        return (['nc', 'run',
                 '-e', 'SNAPSHOT',
                 '-nodb', '-forcelog',
                 '-l', log_file,
                 '-postcmd', postcmd,
                 '-set', job_name] +
                license_args +
                interactive_flags +
                ['--', f'{odir}/run.sh'])

    def _do_launch(self):
        """Submit the job through `nc run`.

        In interactive mode the call blocks until the job has finished. In
        non-interactive mode the job's output is redirected to the job log
        and the call returns as soon as the submission process has started.

        Raises:
            LauncherError: if the helper scripts or the log file cannot be
                written, or the `nc` process cannot be started.
        """
        # Compute the environment for the subprocess by overriding environment
        # variables of this process with matching ones from self.deploy.exports
        exports = os.environ.copy()
        if self.deploy.exports:
            exports.update(self.deploy.exports)

        # Clear the magic MAKEFLAGS variable from exports if necessary. This
        # variable is used by recursive Make calls to pass variables from one
        # level to the next. Here, self.cmd is a call to Make but it's
        # logically a top-level invocation: we don't want to pollute the flow's
        # Makefile with Make variables from any wrapper that called dvsim.
        exports.pop('MAKEFLAGS', None)

        self._dump_env_vars(exports)

        if self.deploy.sim_cfg.interactive:
            # Interactive: Set RUN_INTERACTIVE to 1
            exports['RUN_INTERACTIVE'] = '1'
            try:
                cmd_arr = self.get_submit_cmd(['-I'])
                log.log(VERBOSE,
                        '[Executing]:\n{}\n\n'.format(self.deploy.cmd))
                self.process = subprocess.Popen(cmd_arr,
                                                stdin=None,
                                                stdout=None,
                                                stderr=subprocess.STDOUT,
                                                # string mode
                                                universal_newlines=True,
                                                env=exports,
                                                cwd=self.deploy.odir)
            except (OSError, subprocess.SubprocessError) as e:
                # Popen raises OSError (e.g. FileNotFoundError when `nc` is
                # not installed), which is not a SubprocessError.
                raise LauncherError(
                    f'Failed to launch interactive job: {e}') from e

            # Wait until the process exits
            self.process.wait()
        else:
            log_path = self.deploy.get_log_path()
            try:
                cmd_arr = self.get_submit_cmd(['-nolog', '-wl'])
                # The child process receives its own copy of the log file
                # descriptor when it is spawned, so this process can close
                # its handle as soon as Popen() has returned.
                with open(log_path,
                          'w',
                          encoding='UTF-8',
                          errors='surrogateescape') as f:
                    f.write('[Executing]:\n{}\n\n'.format(self.deploy.cmd))
                    f.flush()
                    self.process = subprocess.Popen(cmd_arr,
                                                    bufsize=4096,
                                                    universal_newlines=True,
                                                    stdout=f,
                                                    stderr=f,
                                                    env=exports,
                                                    cwd=self.deploy.odir)
            except (OSError, subprocess.SubprocessError) as e:
                raise LauncherError(f'IO Error: {e}\n'
                                    f'See {log_path}') from e

        self._link_odir('D')

    def poll(self):
        """Check status of the running process.

        This returns 'D', 'P', 'F', or 'K'. If 'D', the job is still running.
        If 'P', the job finished successfully. If 'F', the job finished with
        an error. If 'K' it was killed.

        This function must only be called after running self.dispatch_cmd() and
        must not be called again once it has returned 'P' or 'F'.
        """
        assert self.process is not None

        if self.process.poll() is None:
            timeout_mins = self.deploy.get_timeout_mins()
            elapsed_time = datetime.datetime.now() - self.start_time
            job_runtime_secs = elapsed_time.total_seconds()
            # Short-circuit evaluation matters here: the runtime comparison
            # must not be evaluated when no timeout is configured (None).
            timed_out = (timeout_mins is not None and
                         job_runtime_secs > (timeout_mins * 60) and
                         not self.deploy.gui)
            if not timed_out:
                return 'D'

            self._kill()
            timeout_message = f'Job timed out after {timeout_mins} minutes'
            self._post_finish('K',
                              ErrorMessage(line_number=None,
                                           message=timeout_message,
                                           context=[timeout_message]))
            return 'K'

        self.exit_code = self.process.returncode
        status, err_msg = self._check_status()
        self._post_finish(status, err_msg)
        return self.status

    def _kill(self):
        """Kill the running process.

        Ask NC to stop every job in this launcher's job set, sending SIGTERM
        and SIGKILL. Failures are logged rather than raised so that callers
        can still finish their own bookkeeping.
        """
        try:
            subprocess.run(['nc', 'stop', '-set', self.deploy.full_name,
                            '-sig', 'TERM,KILL'],
                           check=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            log.error('Failed to kill job: {}'.format(
                e.stderr.decode('utf-8', errors='replace').strip()))
        except OSError as e:
            # E.g. the `nc` executable could not be found or executed.
            log.error('Failed to kill job: {}'.format(e))

    def kill(self):
        """Kill the running process.

        This must be called between dispatching and reaping the process (the
        same window as poll()).
        """
        self._kill()
        self._post_finish(
            'K',
            ErrorMessage(line_number=None, message='Job killed!', context=[]))

    def _post_finish(self, status, err_msg):
        """Release the process handle, then run the base class post-finish.

        Args:
            status: final job status, passed through to the base class.
            err_msg: error description for the job, passed through to the
                base class.
        """
        self._close_process()
        self.process = None
        super()._post_finish(status, err_msg)

    def _close_process(self):
        """Close the file descriptors associated with the process.

        Does nothing if no process has been launched, so it is safe to call
        from error paths.
        """
        if self.process is None:
            return
        if self.process.stdout:
            self.process.stdout.close()
10 changes: 7 additions & 3 deletions util/dvsim/dvsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
import LauncherFactory
import LocalLauncher
import SgeLauncher
import LsfLauncher
import NcLauncher
from CfgFactory import make_cfg
from Deploy import RunTest
from Timer import Timer
Expand Down Expand Up @@ -276,9 +278,9 @@ def parse_args():
# Disable it pending more verbose and automatic solution and document in
# help message
usage='%(prog)s {} [-h] [options]'.format(cfg_metavar),
epilog="Either place the positional argument ahead of the optional args:\n" \
"eg. `dvsim.py {} -i ITEM ITEM` \n" \
"or end a sequence of optional args with `--`:\n" \
epilog="Either place the positional argument ahead of the optional args:\n"
"eg. `dvsim.py {} -i ITEM ITEM` \n"
"or end a sequence of optional args with `--`:\n"
"eg. `dvsim.py -i ITEM ITEM -- {}`\n".format(cfg_metavar, cfg_metavar))

parser.add_argument("cfg",
Expand Down Expand Up @@ -723,6 +725,8 @@ def main():
Timer.print_interval = args.print_interval
LocalLauncher.LocalLauncher.max_parallel = args.max_parallel
SgeLauncher.SgeLauncher.max_parallel = args.max_parallel
LsfLauncher.LsfLauncher.max_parallel = args.max_parallel
NcLauncher.NcLauncher.max_parallel = args.max_parallel
Launcher.Launcher.max_odirs = args.max_odirs
LauncherFactory.set_launcher_type(args.local)

Expand Down
16 changes: 16 additions & 0 deletions util/dvsim/nc_post_cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#! /bin/sh
# Copyright lowRISC contributors (OpenTitan project).
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Print a short completion summary for a finished job and exit with the
# job's exit code.
#
# Arguments:
#   $1  job identifier, used in the messages only
#   $2  exit code of the job

job_id=${1-}
exit_code=${2-}

# The values are passed to printf as arguments rather than spliced into the
# format string, so a '%' or '\' in them is printed literally.
printf 'Job %s exited with code %s\n' "$job_id" "$exit_code"
if [ "$exit_code" -eq 0 ]
then
    printf 'Successfully completed\n'
    printf 'Job %s done\n' "$job_id"
else
    printf 'Job %s failed with error code %s\n' "$job_id" "$exit_code"
fi
cwd=$(realpath .)
printf '<%s> was used as the working directory\n' "$cwd"
# When no exit code was supplied this expands to a bare `exit`, as before.
exit ${exit_code:+"$exit_code"}

0 comments on commit 86ecf39

Please sign in to comment.