Skip to content

Commit

Permalink
Updates envs and configs for MLPerf tests on Cirrus
Browse files Browse the repository at this point in the history
  • Loading branch information
user name committed Sep 24, 2024
1 parent 058ba46 commit 49d7188
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 48 deletions.
45 changes: 31 additions & 14 deletions tests/mlperf/cosmoflow/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,16 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):

num_tasks = None
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
lbs = parameter([2])


time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Setup environment"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
Expand All @@ -52,6 +43,18 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
# "--t_subset_size", "2048",
# "--v_subset_size", "512"
]


elif self.current_system.name in ["cirrus"]:
self.executable = "python"
Expand All @@ -60,14 +63,28 @@ def setup_systems(self):
}
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
# Command-line arguments for the Cirrus CosmoFlow run: the training script
# followed by its options. Every flag and every value is its own list element.
self.executable_opts = [
    "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
    "--config",
    "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
    "--device",
    "cuda",
    "--data-dir",
    # Keep the trailing comma: without it Python's implicit string
    # concatenation merges this path with the following "-lbs" flag into a
    # single list element.
    "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini",
    "-lbs",
    # Cirrus is limited to an lbs of 2 due to memory.
    "2",
    # "--t_subset_size", "2048",
    # "--v_subset_size", "512"
]


@run_before("run")
def set_task_distribution(self):
Expand Down
41 changes: 27 additions & 14 deletions tests/mlperf/deepcam/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,15 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):

num_tasks = None
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
# lbs = parameter([2])

time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Setup environment"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
Expand All @@ -52,6 +42,18 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]


elif self.current_system.name in ["cirrus"]:
self.executable = "python"
Expand All @@ -60,14 +62,25 @@ def setup_systems(self):
}
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"2",
# "--t_subset_size", "1024",
# "--v_subset_size", "512"
]

@run_before("run")
def set_task_distribution(self):
Expand Down
54 changes: 35 additions & 19 deletions tests/mlperf/resnet50/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,15 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck):

num_tasks = None
num_gpus = parameter([4])
lbs = parameter([8])
# Due to memory, Cirrus is limited to a lbs of 2
# lbs = parameter([8])

time_limit = "1h"
num_nodes = 1

@run_after("init")
def setup_systems(self):
"""Environment setup"""
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
f"{self.lbs}",
"--t_subset_size",
"2048",
"--v_subset_size",
"512",
]
if self.current_system.name in ["archer2"]:
self.executable = ""
self.extra_resources = {
Expand All @@ -54,25 +42,53 @@ def setup_systems(self):
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
"HOME": "$PWD",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"8",
"--t_subset_size",
"2048",
"--v_subset_size",
"512",
]


elif self.current_system.name in ["cirrus"]:
self.executable_opts[2] = (
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml",
)
# self.executable_opts[2] = (
# "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml",
# )
self.executable = "python"
self.extra_resources = {
"qos": {"qos": "gpu"},
}
self.modules = ["openmpi/4.1.6-cuda-11.6"]
self.prerun_cmds = [
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
"conda activate mlperf_torch",
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
"conda activate torch_mlperf",
]
self.env_vars = {
"OMP_NUM_THREADS": "5",
"SRUN_CPUS_PER_TASK": "5",
"OMPI_MCA_mpi_warn_on_fork": "0",
}
self.executable_opts = [
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py",
"--config",
"/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml",
"--device",
"cuda",
"-lbs",
"2",
"--t_subset_size",
"2048",
"--v_subset_size",
"512",
]


@run_before("run")
def set_task_distribution(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/mlperf/resnet50/graphcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


@rfm.simple_test
class ResNetGPUServiceBenchmark(ResNet50BaseCheck):
class ResNetGPUServiceGraphCoreBenchmark(ResNet50BaseCheck):
"""Resnet50 test class for graphcore"""

valid_prog_environs = ["*"]
Expand Down

0 comments on commit 49d7188

Please sign in to comment.