diff --git a/milabench/cli/dry.py b/milabench/cli/dry.py
new file mode 100644
index 000000000..b9cdafb43
--- /dev/null
+++ b/milabench/cli/dry.py
@@ -0,0 +1,238 @@
+import os
+import shlex
+from contextlib import contextmanager
+from dataclasses import dataclass
+
+import voir.instruments.gpu as voirgpu
+import yaml
+from coleo import Option, tooled
+
+from ..common import get_multipack
+from ..multi import make_execution_plan
+
+
+class MockDeviceSMI:
+    def __init__(self, ngpu, capacity) -> None:
+        self.devices = [i for i in range(ngpu)]
+        self.used = [1]
+        self.total = [capacity]
+        self.util = [40]
+        self.temp = [35]
+        self.power = [225]
+
+    def get_gpu_info(self, device):
+        return {
+            "device": device,
+            "product": "MockDevice",
+            "memory": {
+                "used": self.used[0] // (1024**2),
+                "total": self.total[0] // (1024**2),
+            },
+            "utilization": {
+                "compute": float(self.util[0]) / 100,
+                "memory": self.used[0] / self.total[0],
+            },
+            "temperature": self.temp[0],
+            "power": self.power[0],
+            "selection_variable": "CUDA_VISIBLE_DEVICES",
+        }
+
+    @property
+    def arch(self):
+        return "mock"
+
+    @property
+    def visible_devices(self):
+        return os.environ.get("CUDA_VISIBLE_DEVICES", None)
+
+    def get_gpus_info(self, selection=None):
+        gpus = dict()
+        for device in self.devices:
+            if (selection is None) or (selection and str(device) in selection):
+                gpus[device] = self.get_gpu_info(device)
+
+        return gpus
+
+    def close(self):
+        pass
+
+
+@contextmanager
+def assume_gpu(ngpu=1, capacity=80000, enabled=False):
+    if enabled:
+        old = voirgpu.DEVICESMI
+        voirgpu.DEVICESMI = MockDeviceSMI(ngpu, capacity)
+        yield
+        voirgpu.DEVICESMI = old
+    else:
+        yield
+
+
+class BashGenerator:
+    def __init__(self) -> None:
+        self.indent = 0
+        self.background_mode = False
+        self.print("#!/bin/sh")
+        self.print("")
+
+    def print(self, *args, **kwargs):
+        print(" " * self.indent, end="")
+        print(*args, **kwargs)
+
+    def section(self, title):
+        self.echo("---")
+        self.echo(title)
+        self.echo("=" * len(title))
+
+    def echo(self, msg):
+        self.print(f'echo "{msg}"')
+
+    def comment(self, cmt):
+        self.print(f"# {cmt}")
+
+    def env(self, env):
+        for k, v in env.items():
+            self.print(f"export {k}={shlex.quote(v)}")
+        self.print()
+
+    @contextmanager
+    def subshell(self):
+        self.print("time (")
+        self.indent += 1
+        yield
+        self.indent -= 1
+        self.print(")")
+
+    @contextmanager
+    def background(self):
+        self.background_mode = True
+        yield
+        self.print("wait")
+        self.background_mode = False
+
+    def command(self, *args, env=None, **kwargs):
+        prefix = []
+        if env is not None:
+            for k, v in env.items():
+                prefix.append(f"{k}={v}")
+
+        prefix = " ".join(prefix)
+
+        # commands emitted inside a background block run concurrently
+        suffix = "&" if self.background_mode else ""
+
+        frags = [prefix] + [str(a) for a in args] + [suffix]
+
+        self.print(" ".join(filter(lambda x: x != "", frags)))
+
+
+# fmt: off
+@dataclass
+class Arguments:
+    nnodes: int = 2
+    ngpu: int = 8
+    capacity: int = 80000
+    withenv: bool = True
+    usevoir: bool = False
+# fmt: on
+
+
+@tooled
+def arguments():
+    ngpu: Option & int = 8
+    capacity: Option & int = 80000
+    nnodes: Option & int = 2
+
+    # [negate]
+    withenv: Option & bool = True
+
+    # [negate]
+    usevoir: Option & bool = True
+    return Arguments(nnodes, ngpu, capacity, withenv, usevoir)
+
+
+@tooled
+def multipack_args(conf: Arguments):
+    from ..common import arguments as multiargs
+
+    args = multiargs()
+    args.system = "system_tmp.yaml"
+
+    system = {
+        "system": {
+            "arch": "cuda",
+            "nodes": [
+                {
+                    "name": str(i),
+                    "ip": f"192.168.0.{i + 10}" if i != 0 else "127.0.0.1",
+                    "user": "username",
+                    "main": i == 0,
+                    "port": 22,
+                }
+                for i in range(conf.nnodes)
+            ],
+        }
+    }
+
+    with open("system_tmp.yaml", "w") as file:
+        file.write(yaml.dump(system))
+
+    return args
+
+
+@tooled
+def cli_dry(args=None):
+    """Generate dry commands to execute the bench standalone"""
+    from ..commands import set_voir
+    from ..system import set_offline
+    from ..sizer import resolve_argv, scale_argv
+
+    if args is None:
+        args = arguments()
+
+    set_offline(True)
+    set_voir(args.usevoir)
+
+    with assume_gpu(args.ngpu, args.capacity, enabled=True):
+        repeat = 1
+        mp = get_multipack(multipack_args(args), run_name="dev")
+        gen = BashGenerator()
+
+        first_pack = True
+        for index in range(repeat):
+            for pack in mp.packs.values():
+                if first_pack and args.withenv:
+                    first_pack = False
+                    gen.section("Virtual Env")
+
+                    venv = pack.core._nox_session.env["VIRTUAL_ENV"]
+                    gen.env({"VIRTUAL_ENV": venv})
+                    gen.print("source $VIRTUAL_ENV/bin/activate")
+
+                    gen.section("Milabench")
+                    gen.env(pack.make_env())
+
+                exec_plan = make_execution_plan(pack, index, repeat)
+
+                gen.section(pack.config["name"])
+                with gen.subshell():
+                    with gen.background():
+                        for pack, argv, _ in exec_plan.commands():
+                            sized_args = scale_argv(pack, argv)
+                            final_args = resolve_argv(pack, sized_args)
+
+                            gen.command(*final_args, env=pack.config.get("env", {}))
+
+                print()
+
+    try:
+        os.remove("system_tmp.yaml")
+    except FileNotFoundError:
+        pass
+
+
+if __name__ == "__main__":
+    cli_dry()
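Note (not part of the patch): a minimal sketch of how the BashGenerator above is driven; the script path and environment values here are made up for illustration.

    # illustrative only -- mirrors the structure cli_dry produces for one bench
    from milabench.cli.dry import BashGenerator

    gen = BashGenerator()                          # prints the "#!/bin/sh" header
    gen.section("example")                         # echo banner for the bench
    gen.env({"MILABENCH_DIR_BASE": "/tmp/base"})   # export statements
    with gen.subshell():                           # wraps the block in `time ( ... )`
        with gen.background():                     # appends `&` to each command, then `wait`
            gen.command("python", "example.py", env={"CUDA_VISIBLE_DEVICES": "0"})

This emits a small script of the same shape as the regression fixtures added below: a banner, the exports, and a timed subshell of backgrounded commands followed by `wait`.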
"username", + "main": i == 0, + "port": 22, + } + for i in range(conf.nnodes) + ], + } + } + + with open("system_tmp.yaml", "w") as file: + system = yaml.dump(system) + file.write(system) + + return args + + +@tooled +def cli_dry(args=None): + """Generate dry commands to execute the bench standalone""" + from ..commands import set_voir + from ..system import set_offline + from ..sizer import resolve_argv, scale_argv + + if args is None: + args = arguments() + + set_offline(True) + set_voir(args.usevoir) + + with assume_gpu(args.ngpu, args.capacity, enabled=True): + repeat = 1 + mp = get_multipack(multipack_args(args), run_name="dev") + gen = BashGenerator() + + first_pack = True + for index in range(repeat): + for pack in mp.packs.values(): + if first_pack and args.withenv: + first_pack = False + gen.section("Virtual Env") + + venv = pack.core._nox_session.env["VIRTUAL_ENV"] + gen.env({"VIRTUAL_ENV": venv}) + gen.print("source $VIRTUAL_ENV/bin/activate") + + gen.section("Milabench") + gen.env(pack.make_env()) + + exec_plan = make_execution_plan(pack, index, repeat) + + gen.section(pack.config["name"]) + with gen.subshell(): + with gen.background(): + for pack, argv, _ in exec_plan.commands(): + + sized_args = scale_argv(pack, argv) + final_args = resolve_argv(pack, sized_args) + + gen.command(*final_args, env=pack.config.get("env", {})) + + print() + + try: + os.remove("system_tmp.yaml") + except: + pass + + + +if __name__ == "__main__": + cli_dry() diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index a1c6da78a..c8c6d9448 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -460,16 +460,35 @@ def _argv(self, **kwargs): devices = self.pack.config.get("devices", []) nproc = len(devices) if nproc > 1: - argv = [*super()._argv(**kwargs), f"--nproc_per_node={nproc}", "--"] + argv = [*super()._argv(**kwargs), f"--nproc_per_node={nproc}"] + # Check if the sub-executor targets a module or not cmd = next(iter(self.exec.argv()), None) - # if the command exists and it is not a path assume it is a module - if cmd and not XPath(cmd).exists(): - argv.append("-m") + + if cmd: + # python or voir; tell it to not prepend python since we are doing it + if cmd in ("python", "voir"): + argv.append("--no-python") + + # if the command exists and it is not a path assume it is a module + # script is not a file, maybe it is a module + elif not XPath(cmd).exists(): + argv.append("-m") + + # everything after torchrun args are script args + argv.append("--") return argv return [] +use_voir = True + + +def set_voir(val): + global use_voir + use_voir = val + + class VoirCommand(WrapperCommand): """Execute an `Command` through voir @@ -491,6 +510,10 @@ def __init__(self, executor: SingleCmdCommand, *voir_argv, **kwargs) -> None: def _argv(self, **kwargs) -> List: argv = super()._argv(**kwargs) + if not use_voir: + # voir replace python + return ["python"] + if voirconf := self.pack.config.get("voir", None): hsh = md5(str(voirconf).encode("utf8")) voirconf_file = ( diff --git a/milabench/common.py b/milabench/common.py index eab593fbd..f08320f80 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -196,11 +196,14 @@ def is_selected(defn, args): ) def _get_multipack( - args: CommonArguments = arguments(), + args: CommonArguments = None, run_name=None, overrides={}, return_config=False, ): + if args is None: + args = arguments() + if args.config is None: args.config = os.environ.get("MILABENCH_CONFIG", None) diff --git a/milabench/metadata.py 
diff --git a/milabench/metadata.py b/milabench/metadata.py
index 392f01f12..a3bacacf3 100644
--- a/milabench/metadata.py
+++ b/milabench/metadata.py
@@ -62,7 +62,7 @@ def machine_metadata(pack=None):
             "machine": uname.machine,
         },
         "accelerators": gpus,
-        "date": datetime.datetime.now(datetime.UTC).timestamp(),
+        "date": datetime.datetime.utcnow().timestamp(),
         "milabench": retrieve_git_versions(
             __tag__,
             __commit__,
diff --git a/milabench/system.py b/milabench/system.py
index 6edbba93d..520c9c526 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -167,6 +167,13 @@ def _resolve_ip(ip):
     try:
         hostname, aliaslist, ipaddrlist = socket.gethostbyaddr(ip)
         lazy_raise = None
+
+    except socket.herror as err:
+        hostname = ip
+        aliaslist = []
+        ipaddrlist = []
+        lazy_raise = err
+
     except socket.gaierror as err:
         # Get Addr Info (GAI) Error
         #
@@ -195,6 +202,15 @@ def _fix_weird(hostname):
     return hostname
 
 
+# If true that means we cannot resolve the ip addresses
+# so we ignore errors
+offline = False
+
+def set_offline(value):
+    global offline
+    offline = value
+
+
 def resolve_addresses(nodes):
     # Note: it is possible for self to be none
     # if we are running milabench on a node that is not part of the system
@@ -229,8 +245,9 @@ def resolve_addresses(nodes):
 
     # if self is node we might be outisde the cluster
    # which explains why we could not resolve the IP of the nodes
-    if self is not None and lazy_raise:
-        raise RuntimeError("Could not resolve node ip") from lazy_raise
+    if not offline:
+        if self is not None and lazy_raise:
+            raise RuntimeError("Could not resolve node ip") from lazy_raise
 
     return self
diff --git a/tests/conftest.py b/tests/conftest.py
index f2a1ddca2..3d021c570 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -25,6 +25,19 @@ def get_config(name):
     return get_config
 
+
+@pytest.fixture
+def official_config():
+    def get_config(name):
+        return here / ".."
/ "config" / f"{name}.yaml" + return get_config + + +@pytest.fixture +def standard_config(official_config): + return official_config("standard") + + @pytest.fixture def replayfolder(): return here / "replays" diff --git a/tests/test_command_reg.py b/tests/test_command_reg.py new file mode 100644 index 000000000..f44b5a8db --- /dev/null +++ b/tests/test_command_reg.py @@ -0,0 +1,44 @@ +import os +from milabench.cli.dry import cli_dry, Arguments + + +ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + + +def compare(base, capsys, file_regression): + all = capsys.readouterr() + stdout = all.out + assert stdout != "" + + stdout = stdout.replace(str(base), "$BASE") + stdout = stdout.replace(str(ROOT), "$SRC") + file_regression.check(stdout) + + + +def test_command_reg_one_node(standard_config, tmp_path, capsys, file_regression): + os.environ["MILABENCH_CONFIG"] = str(standard_config) + os.environ["MILABENCH_BASE"] = str(tmp_path) + + args = Arguments() + args.ngpu = 8 + args.capacity = 80000 + args.nnodes = 1 + + cli_dry(args) + + compare(str(tmp_path), capsys, file_regression) + + +def test_command_reg_two_nodes(standard_config, tmp_path, capsys, file_regression): + os.environ["MILABENCH_CONFIG"] = str(standard_config) + os.environ["MILABENCH_BASE"] = str(tmp_path) + + args = Arguments() + args.ngpu = 8 + args.capacity = 80000 + args.nnodes = 2 + + cli_dry(args) + + compare(str(tmp_path), capsys, file_regression) \ No newline at end of file diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt new file mode 100644 index 000000000..02950c9ea --- /dev/null +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -0,0 +1,452 @@ +#!/bin/sh + +echo "---" +echo "Virtual Env" +echo "===========" +export VIRTUAL_ENV=$BASE/venv/torch + +source $VIRTUAL_ENV/bin/activate +echo "---" +echo "Milabench" +echo "=========" +export MILABENCH_DIR_BASE=$BASE +export MILABENCH_DIR_VENV=$BASE/venv/torch +export MILABENCH_DIR_DATA=$BASE/data +export MILABENCH_DIR_RUNS=$BASE/runs +export MILABENCH_DIR_EXTRA=$BASE/extra/llm +export MILABENCH_DIR_CACHE=$BASE/cache +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "localhost", "aliaslist": [], "ipaddrlist": ["00:00:03:9b:fe:80:00:00:00:00:00:00:b8:ce:f6:03:00:0e:54:86", "fe80::3eec:efff:fe72:9f02%eno1np0", "00:00:00:00:00:00", "3c:ec:ef:72:9f:02", "10.20.10.13", "172.16.10.13", "::1", "fe80::bace:f603:e:5486%ibp65s0f0", "127.0.0.1"], "local": true}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "localhost", "aliaslist": [], "ipaddrlist": ["00:00:03:9b:fe:80:00:00:00:00:00:00:b8:ce:f6:03:00:0e:54:86", "fe80::3eec:efff:fe72:9f02%eno1np0", "00:00:00:00:00:00", "3c:ec:ef:72:9f:02", "10.20.10.13", "172.16.10.13", "::1", "fe80::bace:f603:e:5486%ibp65s0f0", "127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/tests/../config", "config_file": 
"$SRC/milabench/tests/../config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' +export OMP_NUM_THREADS=8 + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 
$SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 
--m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp" +echo "=============" +time ( + $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 
0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf 
--batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 
--num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "resnet152" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir 
$BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "resnet152-multi" +echo "===============" +time ( + $BASE/venv/torch/bin/torchrun --nproc_per_node=8 --no-python -- python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-multi.0 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "davit_large" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 
--device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "davit_large-multi" +echo "=================" +time ( + $BASE/venv/torch/bin/torchrun --nproc_per_node=8 --no-python -- python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-multi.0 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python 
$SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "opt-1_3b" +echo "========" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-1.3b --cache $BASE/cache & + wait +) + +echo "---" +echo "opt-1_3b-multinode" +echo "==================" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-1.3b --cache $BASE/cache & + wait +) + +echo "---" +echo "opt-6_7b" +echo "========" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-6.7b --cache $BASE/cache & + wait +) + +echo "---" +echo "opt-6_7b-multinode" +echo "==================" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 
--per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-6.7b --cache $BASE/cache & + wait +) + +echo "---" +echo "stargan" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + wait +) + +echo "---" +echo "super-slomo" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch 
--num_workers 8 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + wait +) + +echo "---" +echo "dlrm" +echo "====" +time ( + python $SRC/milabench/benchmarks/dlrm/dlrm/dlrm_s_pytorch.py --num-batches 1000 --data-generation random --arch-mlp-bot 512-512-64 --arch-mlp-top 1024-1024-1024-1 --arch-sparse-feature-size 64 --arch-embedding-size 1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 --num-indices-per-lookup 100 --arch-interaction-op dot --numpy-rand-seed 727 --print-freq 999999 --mini-batch-size 16384 --test-mini-batch-size 16384 --test-num-workers 0 --use-gpu --num-workers 8 & + wait +) + +echo "---" +echo "brax" +echo "====" +time ( + python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & + wait +) + diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt new file mode 100644 index 000000000..92942b264 --- /dev/null +++ b/tests/test_command_reg/test_command_reg_two_nodes.txt @@ -0,0 +1,454 @@ +#!/bin/sh + +echo "---" +echo "Virtual Env" +echo "===========" +export VIRTUAL_ENV=$BASE/venv/torch + +source $VIRTUAL_ENV/bin/activate +echo "---" +echo "Milabench" +echo "=========" +export MILABENCH_DIR_BASE=$BASE +export MILABENCH_DIR_VENV=$BASE/venv/torch +export MILABENCH_DIR_DATA=$BASE/data +export MILABENCH_DIR_RUNS=$BASE/runs +export MILABENCH_DIR_EXTRA=$BASE/extra/llm +export MILABENCH_DIR_CACHE=$BASE/cache +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "localhost", "aliaslist": [], "ipaddrlist": ["00:00:03:9b:fe:80:00:00:00:00:00:00:b8:ce:f6:03:00:0e:54:86", "fe80::3eec:efff:fe72:9f02%eno1np0", "00:00:00:00:00:00", "3c:ec:ef:72:9f:02", "10.20.10.13", "172.16.10.13", "::1", "fe80::bace:f603:e:5486%ibp65s0f0", "127.0.0.1"], "local": true}, {"ip": "192.168.0.11", "main": false, "name": "1", "port": 22, "user": "username", "hostname": "192.168.0.11", "aliaslist": [], "ipaddrlist": [], "local": false}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "localhost", "aliaslist": [], "ipaddrlist": ["00:00:03:9b:fe:80:00:00:00:00:00:00:b8:ce:f6:03:00:0e:54:86", "fe80::3eec:efff:fe72:9f02%eno1np0", "00:00:00:00:00:00", "3c:ec:ef:72:9f:02", "10.20.10.13", "172.16.10.13", "::1", "fe80::bace:f603:e:5486%ibp65s0f0", "127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", 
"install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/tests/../config", "config_file": "$SRC/milabench/tests/../config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' +export OMP_NUM_THREADS=8 + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 
$SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 
256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp" +echo "=============" +time ( + $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large 
--batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 
--loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert 
--batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "resnet152" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir 
$BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "resnet152-multi" +echo "===============" +time ( + $BASE/venv/torch/bin/torchrun --nproc_per_node=8 --no-python -- python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model resnet152 --batch-size 256 --output $BASE/extra/timm/dev/resnet152-multi.0 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "davit_large" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda 
--val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "davit_large-multi" +echo "=================" +time ( + $BASE/venv/torch/bin/torchrun --nproc_per_node=8 --no-python -- python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model davit_large --batch-size 128 --lr-base 0.01 --output $BASE/extra/timm/dev/davit_large-multi.0 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python 
$SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "opt-1_3b" +echo "========" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-1.3b --cache $BASE/cache & + wait +) + +echo "---" +echo "opt-1_3b-multinode" +echo "==================" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-1.3b --cache $BASE/cache & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-1.3b --cache $BASE/cache & + wait +) + +echo "---" +echo "opt-6_7b" +echo "========" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 
--num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-6.7b --cache $BASE/cache & + wait +) + +echo "---" +echo "opt-6_7b-multinode" +echo "==================" +time ( + $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-6.7b --cache $BASE/cache & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/benchmarks/accelerate_opt/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --use_deepspeed --deepspeed_multinode_launcher=standard --zero_stage=2 --gradient_accumulation_steps=1 --num_cpu_threads_per_process=8 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 $SRC/milabench/benchmarks/accelerate_opt/main.py --max_train_steps 100 --dataset_name wikitext --dataset_config_name wikitext-103-v1 --dataset_rev b08601e --validation_split_percentage 5 --per_gpu_batch_size 1 --cpus_per_gpu 8 --model_name facebook/opt-6.7b --cache $BASE/cache & + wait +) + +echo "---" +echo "stargan" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir 
$BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/stargan/stargan/main.py --image_size 512 --c_dim 5 --batch_size 16 --dataset synth --celeba_image_dir $BASE/data --log_dir $BASE/extra/stargan/logs --model_save_dir $BASE/extra/stargan/models --sample_dir $BASE/extra/stargan/samples --result_dir $BASE/extra/stargan/results --num_workers 8 & + wait +) + +echo "---" +echo "super-slomo" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & + wait +) + +echo "---" +echo "dlrm" +echo "====" +time ( + python $SRC/milabench/benchmarks/dlrm/dlrm/dlrm_s_pytorch.py --num-batches 1000 --data-generation random --arch-mlp-bot 512-512-64 --arch-mlp-top 1024-1024-1024-1 --arch-sparse-feature-size 64 --arch-embedding-size 1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 --num-indices-per-lookup 100 --arch-interaction-op dot --numpy-rand-seed 727 --print-freq 999999 --mini-batch-size 16384 --test-mini-batch-size 16384 --test-num-workers 0 --use-gpu --num-workers 8 & + wait +) + +echo "---" +echo "brax" +echo "====" +time ( + python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & + wait +) +