Skip to content

Commit

Permalink
Add dry command regression tests
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Jul 3, 2024
1 parent 00e5737 commit 0845bbf
Show file tree
Hide file tree
Showing 9 changed files with 1,252 additions and 8 deletions.
238 changes: 238 additions & 0 deletions milabench/cli/dry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
import os
import shlex
from contextlib import contextmanager
from dataclasses import dataclass

import voir.instruments.gpu as voirgpu
import yaml
from coleo import Option, tooled

from ..common import get_multipack
from ..multi import make_execution_plan


class MockDeviceSMI:
    """Stand-in device monitor that reports ``ngpu`` identical mock GPUs.

    The statistics are stored as single-element lists and every device reads
    entry 0, so all mock devices report the same numbers.
    """

    def __init__(self, ngpu, capacity) -> None:
        # Device ids are simply 0..ngpu-1.
        self.devices = list(range(ngpu))
        self.used = [1]
        self.total = [capacity]
        self.util = [40]
        self.temp = [35]
        self.power = [225]

    def get_gpu_info(self, device):
        """Return the info dictionary for ``device`` (same stats for all)."""
        scale = 1024**2
        used = self.used[0]
        total = self.total[0]

        memory = {
            "used": used // scale,
            "total": total // scale,
        }
        utilization = {
            "compute": float(self.util[0]) / 100,
            "memory": used / total,
        }
        return {
            "device": device,
            "product": "MockDevice",
            "memory": memory,
            "utilization": utilization,
            "temperature": self.temp[0],
            "power": self.power[0],
            "selection_variable": "CUDA_VISIBLE_DEVICES",
        }

    @property
    def arch(self):
        """Architecture name reported by the mock."""
        return "mock"

    @property
    def visible_devices(self):
        """Value of ``CUDA_VISIBLE_DEVICES`` in the environment, or ``None``."""
        return os.environ.get("CUDA_VISIBLE_DEVICES")

    def get_gpus_info(self, selection=None):
        """Return ``{device: info}`` for the selected devices.

        ``selection=None`` selects every device; otherwise a device is kept
        when ``str(device) in selection`` (an empty selection keeps nothing).
        """

        def is_selected(device):
            if selection is None:
                return True
            return bool(selection) and str(device) in selection

        return {
            device: self.get_gpu_info(device)
            for device in self.devices
            if is_selected(device)
        }

    def close(self):
        """Nothing to release for the mock."""
        return None


@contextmanager
def assume_gpu(ngpu=1, capacity=80000, enabled=False):
    """Temporarily replace ``voirgpu.DEVICESMI`` with a ``MockDeviceSMI``.

    Args:
        ngpu: number of mock devices handed to ``MockDeviceSMI``.
        capacity: capacity value handed to ``MockDeviceSMI``.
        enabled: when false the context manager does nothing.

    The previous ``voirgpu.DEVICESMI`` is restored when the block exits,
    including when the block raises.
    """
    if not enabled:
        yield
        return

    old = voirgpu.DEVICESMI
    voirgpu.DEVICESMI = MockDeviceSMI(ngpu, capacity)
    try:
        yield
    finally:
        # Restore even on error so the mock never leaks past the ``with``.
        voirgpu.DEVICESMI = old


class BashGenerator:
    """Print a shell script to stdout, one statement at a time.

    ``indent`` is the number of spaces printed before each line and
    ``background_mode`` controls whether ``command`` appends ``&``.
    Creating the generator immediately prints the shebang line.
    """

    def __init__(self) -> None:
        self.indent = 0
        self.background_mode = False
        self.print("#!/bin/sh")
        self.print("")

    def print(self, *args, **kwargs):
        """Print ``args`` prefixed by the current indentation."""
        print(" " * self.indent, end="")
        print(*args, **kwargs)

    def section(self, title):
        """Emit ``echo`` statements showing ``title`` as an underlined header."""
        self.echo("---")
        self.echo(title)
        self.echo("=" * len(title))

    def echo(self, msg):
        """Emit an ``echo`` statement for ``msg``."""
        self.print(f'echo "{msg}"')

    def comment(self, cmt):
        """Emit a shell comment."""
        self.print(f"# {cmt}")

    def env(self, env):
        """Emit one ``export`` per entry of ``env`` followed by a blank line.

        Values are converted with ``str`` before quoting because
        ``shlex.quote`` only accepts strings.
        """
        for k, v in env.items():
            self.print(f"export {k}={shlex.quote(str(v))}")
        self.print()

    @contextmanager
    def subshell(self):
        """Wrap the statements emitted inside the block in ``time ( ... )``."""
        self.print("time (")
        self.indent += 1
        try:
            yield
        finally:
            # Always undo the indentation so a failure does not skew later output.
            self.indent -= 1
        self.print(")")

    @contextmanager
    def background(self):
        """Emit the block's commands as background jobs, then ``wait``."""
        previous = self.background_mode
        self.background_mode = True
        try:
            yield
        finally:
            # Restore the previous mode (not blindly False) so nesting works.
            self.background_mode = previous
        self.print("wait")

    def command(self, *args, env=None, **kwargs):
        """Emit one command line.

        Args:
            *args: command fragments, converted with ``str``; fragments that
                are empty strings are dropped.
            env: optional mapping emitted as ``KEY=value`` assignments in
                front of the command (values are not quoted).
            **kwargs: accepted and ignored.

        ``&`` is appended only while inside a ``background()`` block.
        """
        assignments = []
        if env is not None:
            assignments = [f"{k}={v}" for k, v in env.items()]

        prefix = " ".join(assignments)
        suffix = "&" if self.background_mode else ""

        frags = [prefix, *(str(a) for a in args), suffix]
        self.print(" ".join(frag for frag in frags if frag != ""))


# fmt: off
@dataclass
class Arguments:
    """Options of the dry-run command.

    The field order matters: ``arguments()`` builds this object positionally.
    """

    # Number of entries generated in the temporary system file's node list.
    nnodes: int = 2
    # Number of mock devices given to ``assume_gpu``.
    ngpu: int = 8
    # Capacity value given to ``assume_gpu`` for the mock devices.
    capacity: int = 80000
    # When true, the virtual env / milabench environment sections are emitted.
    withenv: bool = True
    # Value forwarded to ``set_voir``.
    usevoir: bool = False
# fmt: on


@tooled
def arguments():
    """Collect the dry-run options and bundle them into an ``Arguments``.

    Each local below is declared with coleo's ``Option`` marker and its
    default value. The ``# [negate]`` comments are kept exactly where they
    are, directly above the declaration they precede (presumably read by
    coleo as a directive -- confirm before moving or rewording them).

    NOTE(review): ``usevoir`` defaults to ``True`` here while the
    ``Arguments`` dataclass defaults it to ``False`` -- confirm which one is
    intended.
    """
    ngpu: Option & int = 8
    capacity: Option & int = 80000
    nnodes: Option & int = 2

    # [negate]
    withenv: Option & bool = True

    # [negate]
    usevoir: Option & bool = True
    return Arguments(nnodes, ngpu, capacity, withenv, usevoir)


@tooled
def multipack_args(conf: Arguments):
    """Build the multipack arguments pointing at a generated system file.

    Writes ``system_tmp.yaml`` in the current directory describing
    ``conf.nnodes`` nodes (node 0 is the main node on ``127.0.0.1``) and
    returns the common arguments with ``system`` set to that file.
    """
    from ..common import arguments as multiargs

    args = multiargs()
    args.system = "system_tmp.yaml"

    nodes = []
    for i in range(conf.nnodes):
        is_main = i == 0
        nodes.append(
            {
                "name": str(i),
                "ip": "127.0.0.1" if is_main else f"192.168.0.{i + 10}",
                "user": "username",
                "main": is_main,
                "port": 22,
            }
        )

    system = {"system": {"arch": "cuda", "nodes": nodes}}

    with open("system_tmp.yaml", "w") as file:
        file.write(yaml.dump(system))

    return args


@tooled
def cli_dry(args=None):
    """Generate dry commands to execute the bench standalone"""
    # The docstring above is left untouched on purpose: it is presumably
    # surfaced as the command's help text -- confirm before rewording it.
    #
    # Overview: with a mocked GPU monitor in place, build the multipack and
    # print a shell script that runs every pack's commands. Switches the
    # ``offline`` and ``use_voir`` module flags via their setters (they are
    # not restored afterwards) and removes the temporary system file written
    # by ``multipack_args`` when done, even if generation fails.
    from ..commands import set_voir
    from ..system import set_offline
    from ..sizer import resolve_argv, scale_argv

    if args is None:
        args = arguments()

    set_offline(True)
    set_voir(args.usevoir)

    try:
        with assume_gpu(args.ngpu, args.capacity, enabled=True):
            repeat = 1
            mp = get_multipack(multipack_args(args), run_name="dev")
            gen = BashGenerator()

            first_pack = True
            for index in range(repeat):
                for pack in mp.packs.values():
                    # The environment preamble is only emitted once, before
                    # the first pack.
                    if first_pack and args.withenv:
                        first_pack = False
                        gen.section("Virtual Env")

                        venv = pack.core._nox_session.env["VIRTUAL_ENV"]
                        gen.env({"VIRTUAL_ENV": venv})
                        gen.print("source $VIRTUAL_ENV/bin/activate")

                        gen.section("Milabench")
                        gen.env(pack.make_env())

                    exec_plan = make_execution_plan(pack, index, repeat)

                    gen.section(pack.config["name"])
                    with gen.subshell():
                        with gen.background():
                            # Distinct name so the outer loop's ``pack`` is
                            # not shadowed by the plan's own packs.
                            for cmd_pack, argv, _ in exec_plan.commands():
                                sized_args = scale_argv(cmd_pack, argv)
                                final_args = resolve_argv(cmd_pack, sized_args)

                                gen.command(
                                    *final_args,
                                    env=cmd_pack.config.get("env", {}),
                                )

                    print()
    finally:
        # Best-effort cleanup of the file created by multipack_args; it may
        # not exist if we failed before writing it.
        try:
            os.remove("system_tmp.yaml")
        except OSError:
            pass



# Allow running this module directly as a script.
if __name__ == "__main__":
    cli_dry()
31 changes: 27 additions & 4 deletions milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,16 +460,35 @@ def _argv(self, **kwargs):
devices = self.pack.config.get("devices", [])
nproc = len(devices)
if nproc > 1:
argv = [*super()._argv(**kwargs), f"--nproc_per_node={nproc}", "--"]
argv = [*super()._argv(**kwargs), f"--nproc_per_node={nproc}"]

# Check if the sub-executor targets a module or not
cmd = next(iter(self.exec.argv()), None)
# if the command exists and it is not a path assume it is a module
if cmd and not XPath(cmd).exists():
argv.append("-m")

if cmd:
# python or voir; tell it to not prepend python since we are doing it
if cmd in ("python", "voir"):
argv.append("--no-python")

# if the command exists and it is not a path assume it is a module
# script is not a file, maybe it is a module
elif not XPath(cmd).exists():
argv.append("-m")

# everything after torchrun args are script args
argv.append("--")
return argv
return []


# Module-level switch; a `_argv` method further down returns ["python"]
# instead of the voir command line when this is false.
use_voir = True


def set_voir(val):
    """Set the module-level ``use_voir`` flag to ``val``."""
    global use_voir
    use_voir = val


class VoirCommand(WrapperCommand):
"""Execute an `Command` through voir
Expand All @@ -491,6 +510,10 @@ def __init__(self, executor: SingleCmdCommand, *voir_argv, **kwargs) -> None:
def _argv(self, **kwargs) -> List:
argv = super()._argv(**kwargs)

if not use_voir:
# voir replace python
return ["python"]

if voirconf := self.pack.config.get("voir", None):
hsh = md5(str(voirconf).encode("utf8"))
voirconf_file = (
Expand Down
5 changes: 4 additions & 1 deletion milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,14 @@ def is_selected(defn, args):
)

def _get_multipack(
args: CommonArguments = arguments(),
args: CommonArguments = None,
run_name=None,
overrides={},
return_config=False,
):
if args is None:
args = arguments()

if args.config is None:
args.config = os.environ.get("MILABENCH_CONFIG", None)

Expand Down
2 changes: 1 addition & 1 deletion milabench/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def machine_metadata(pack=None):
"machine": uname.machine,
},
"accelerators": gpus,
"date": datetime.datetime.now(datetime.UTC).timestamp(),
"date": datetime.datetime.utcnow().timestamp(),
"milabench": retrieve_git_versions(
__tag__,
__commit__,
Expand Down
21 changes: 19 additions & 2 deletions milabench/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,13 @@ def _resolve_ip(ip):
try:
hostname, aliaslist, ipaddrlist = socket.gethostbyaddr(ip)
lazy_raise = None

except socket.herror as err:
hostname = ip
aliaslist = []
ipaddrlist = []
lazy_raise = err

except socket.gaierror as err:
# Get Addr Info (GAI) Error
#
Expand Down Expand Up @@ -195,6 +202,15 @@ def _fix_weird(hostname):
return hostname


# When true, node IP addresses are not expected to be resolvable,
# so resolve_addresses does not raise its "Could not resolve node ip" error.
offline = False

def set_offline(value):
    """Set the module-level ``offline`` flag to ``value``."""
    global offline
    offline = value


def resolve_addresses(nodes):
# Note: it is possible for self to be none
# if we are running milabench on a node that is not part of the system
Expand Down Expand Up @@ -229,8 +245,9 @@ def resolve_addresses(nodes):

# if self is None we might be outside the cluster,
# which explains why we could not resolve the IP of the nodes
if self is not None and lazy_raise:
raise RuntimeError("Could not resolve node ip") from lazy_raise
if not offline:
if self is not None and lazy_raise:
raise RuntimeError("Could not resolve node ip") from lazy_raise

return self

Expand Down
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ def get_config(name):
return get_config



@pytest.fixture
def official_config():
    """Fixture returning a function that maps a config name to its YAML path.

    The path is ``<here>/../config/<name>.yaml``, where ``here`` is defined
    elsewhere in this file.
    """
    def get_config(name):
        return here / ".." / "config" / f"{name}.yaml"
    return get_config


@pytest.fixture
def standard_config(official_config):
    """Fixture returning the path of the ``standard`` official config."""
    return official_config("standard")


@pytest.fixture
def replayfolder():
    """Fixture returning the ``replays`` folder located under ``here``."""
    return here / "replays"
Expand Down
Loading

0 comments on commit 0845bbf

Please sign in to comment.