From 49d7188aa85b519ecf1f9ae0bf988d5ad2de36bd Mon Sep 17 00:00:00 2001
From: user name
Date: Tue, 24 Sep 2024 16:31:59 +0100
Subject: [PATCH] Updates envs and configs for MLPerf tests on Cirrus

---
 tests/mlperf/cosmoflow/gpu.py      | 45 +++++++++++++++++--------
 tests/mlperf/deepcam/gpu.py        | 41 +++++++++++++++--------
 tests/mlperf/resnet50/gpu.py       | 54 +++++++++++++++++++-----------
 tests/mlperf/resnet50/graphcore.py |  2 +-
 4 files changed, 94 insertions(+), 48 deletions(-)

diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py
index 4518b6a..56d38a1 100644
--- a/tests/mlperf/cosmoflow/gpu.py
+++ b/tests/mlperf/cosmoflow/gpu.py
@@ -17,7 +17,9 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
 
     num_tasks = None
     num_gpus = parameter([4])  # parameter(1 << pow for pow in range(7))
-    lbs = parameter([8])
+    # Due to memory, Cirrus is limited to a lbs of 2
+    lbs = parameter([2])
+
     time_limit = "1h"
     num_nodes = 1
 
@@ -25,17 +27,6 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
     @run_after("init")
     def setup_systems(self):
         """Setup environment"""
-        self.executable_opts = [
-            "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
-            "--config",
-            "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
-            "--device",
-            "cuda",
-            "-lbs",
-            f"{self.lbs}",
-            # "--t_subset_size", "2048",
-            # "--v_subset_size", "512"
-        ]
         if self.current_system.name in ["archer2"]:
            self.executable = ""
            self.extra_resources = {
@@ -52,6 +43,18 @@ def setup_systems(self):
                 "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
                 "HOME": "$PWD",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
+                "--device",
+                "cuda",
+                "-lbs",
+                "8",
+                # "--t_subset_size", "2048",
+                # "--v_subset_size", "512"
+            ]
+
 
         elif self.current_system.name in ["cirrus"]:
             self.executable = "python"
@@ -60,14 +63,28 @@
             }
             self.modules = ["openmpi/4.1.6-cuda-11.6"]
             self.prerun_cmds = [
-                'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
-                "conda activate mlperf_torch",
+                'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
+                "conda activate torch_mlperf",
             ]
             self.env_vars = {
                 "OMP_NUM_THREADS": "5",
                 "SRUN_CPUS_PER_TASK": "5",
                 "OMPI_MCA_mpi_warn_on_fork": "0",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
+                "--device",
+                "cuda",
+                "--data-dir",
+                "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini",
+                "-lbs",
+                "2",
+                # "--t_subset_size", "2048",
+                # "--v_subset_size", "512"
+            ]
+
 
     @run_before("run")
     def set_task_distribution(self):
diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py
index 30ceecd..39a917a 100644
--- a/tests/mlperf/deepcam/gpu.py
+++ b/tests/mlperf/deepcam/gpu.py
@@ -17,7 +17,8 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
 
     num_tasks = None
     num_gpus = parameter([4])  # parameter(1 << pow for pow in range(7))
-    lbs = parameter([8])
+    # Due to memory, Cirrus is limited to a lbs of 2
+    # lbs = parameter([2])
     time_limit = "1h"
     num_nodes = 1
 
@@ -25,17 +26,6 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
     @run_after("init")
     def setup_systems(self):
         """Setup environment"""
-        self.executable_opts = [
-            "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - # "--t_subset_size", "1024", - # "--v_subset_size", "512" - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -52,6 +42,18 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + # "--t_subset_size", "1024", + # "--v_subset_size", "512" + ] + elif self.current_system.name in ["cirrus"]: self.executable = "python" @@ -60,14 +62,25 @@ def setup_systems(self): } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "2", + # "--t_subset_size", "1024", + # "--v_subset_size", "512" + ] @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py index 21ba687..7e64bbd 100644 --- a/tests/mlperf/resnet50/gpu.py +++ b/tests/mlperf/resnet50/gpu.py @@ -17,7 +17,8 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): num_tasks = None num_gpus = parameter([4]) - lbs = parameter([8]) + # Due to memory, Cirrus is limited to a lbs of 2 + # lbs = parameter([8]) time_limit = "1h" num_nodes = 1 @@ -25,19 +26,6 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): @run_after("init") def setup_systems(self): """Environment setup""" - self.executable_opts = [ - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - "--t_subset_size", - "2048", - "--v_subset_size", - "512", - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -54,25 +42,53 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + "--t_subset_size", + "2048", + "--v_subset_size", + "512", + ] + elif self.current_system.name in ["cirrus"]: - self.executable_opts[2] = ( - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", - ) + # self.executable_opts[2] = ( + # "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", + # ) self.executable = "python" self.extra_resources = { "qos": {"qos": "gpu"}, } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval 
"$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "2", + "--t_subset_size", + "2048", + "--v_subset_size", + "512", + ] + @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/resnet50/graphcore.py b/tests/mlperf/resnet50/graphcore.py index a77b62a..58fc8a0 100644 --- a/tests/mlperf/resnet50/graphcore.py +++ b/tests/mlperf/resnet50/graphcore.py @@ -8,7 +8,7 @@ @rfm.simple_test -class ResNetGPUServiceBenchmark(ResNet50BaseCheck): +class ResNetGPUServiceGraphCoreBenchmark(ResNet50BaseCheck): """Resnet50 test class for graphcore""" valid_prog_environs = ["*"]