From 49d7188aa85b519ecf1f9ae0bf988d5ad2de36bd Mon Sep 17 00:00:00 2001
From: user name
Date: Tue, 24 Sep 2024 16:31:59 +0100
Subject: [PATCH] Updates envs and configs for MLPerf tests on Cirrus

---
 tests/mlperf/cosmoflow/gpu.py      | 45 +++++++++++++++++--------
 tests/mlperf/deepcam/gpu.py        | 41 +++++++++++++++--------
 tests/mlperf/resnet50/gpu.py       | 54 +++++++++++++++++++-----------
 tests/mlperf/resnet50/graphcore.py |  2 +-
 4 files changed, 94 insertions(+), 48 deletions(-)

diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py
index 4518b6a..56d38a1 100644
--- a/tests/mlperf/cosmoflow/gpu.py
+++ b/tests/mlperf/cosmoflow/gpu.py
@@ -17,7 +17,9 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
 
     num_tasks = None
     num_gpus = parameter([4])  # parameter(1 << pow for pow in range(7))
-    lbs = parameter([8])
+    # Due to memory, Cirrus is limited to a lbs of 2
+    lbs = parameter([2])
+
     time_limit = "1h"
     num_nodes = 1
 
@@ -25,17 +27,6 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
     @run_after("init")
     def setup_systems(self):
         """Setup environment"""
-        self.executable_opts = [
-            "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
-            "--config",
-            "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
-            "--device",
-            "cuda",
-            "-lbs",
-            f"{self.lbs}",
-            # "--t_subset_size", "2048",
-            # "--v_subset_size", "512"
-        ]
         if self.current_system.name in ["archer2"]:
            self.executable = ""
            self.extra_resources = {
@@ -52,6 +43,18 @@ def setup_systems(self):
                 "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
                 "HOME": "$PWD",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
+                "--device",
+                "cuda",
+                "-lbs",
+                "8",
+                # "--t_subset_size", "2048",
+                # "--v_subset_size", "512"
+            ]
+
 
         elif self.current_system.name in ["cirrus"]:
             self.executable = "python"
@@ -60,14 +63,28 @@
             }
             self.modules = ["openmpi/4.1.6-cuda-11.6"]
             self.prerun_cmds = [
-                'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
-                "conda activate mlperf_torch",
+                'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
+                "conda activate torch_mlperf",
             ]
             self.env_vars = {
                 "OMP_NUM_THREADS": "5",
                 "SRUN_CPUS_PER_TASK": "5",
                 "OMPI_MCA_mpi_warn_on_fork": "0",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
+                "--device",
+                "cuda",
+                "--data-dir",
+                "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini",
+                "-lbs",
+                "2",
+                # "--t_subset_size", "2048",
+                # "--v_subset_size", "512"
+            ]
+
 
     @run_before("run")
     def set_task_distribution(self):
diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py
index 30ceecd..39a917a 100644
--- a/tests/mlperf/deepcam/gpu.py
+++ b/tests/mlperf/deepcam/gpu.py
@@ -17,7 +17,8 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
 
     num_tasks = None
     num_gpus = parameter([4])  # parameter(1 << pow for pow in range(7))
-    lbs = parameter([8])
+    # Due to memory, Cirrus is limited to a lbs of 2
+    # lbs = parameter([2])
     time_limit = "1h"
     num_nodes = 1
 
@@ -25,17 +26,6 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
     @run_after("init")
     def setup_systems(self):
         """Setup environment"""
-        self.executable_opts = [
-            "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - # "--t_subset_size", "1024", - # "--v_subset_size", "512" - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -52,6 +42,18 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + # "--t_subset_size", "1024", + # "--v_subset_size", "512" + ] + elif self.current_system.name in ["cirrus"]: self.executable = "python" @@ -60,14 +62,25 @@ def setup_systems(self): } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "2", + # "--t_subset_size", "1024", + # "--v_subset_size", "512" + ] @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py index 21ba687..7e64bbd 100644 --- a/tests/mlperf/resnet50/gpu.py +++ b/tests/mlperf/resnet50/gpu.py @@ -17,7 +17,8 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): num_tasks = None num_gpus = parameter([4]) - lbs = parameter([8]) + # Due to memory, Cirrus is limited to a lbs of 2 + # lbs = parameter([8]) time_limit = "1h" num_nodes = 1 @@ -25,19 +26,6 @@ class ResNet50GPUBenchmark(ResNet50BaseCheck): @run_after("init") def setup_systems(self): """Environment setup""" - self.executable_opts = [ - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", - "--config", - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml", - "--device", - "cuda", - "-lbs", - f"{self.lbs}", - "--t_subset_size", - "2048", - "--v_subset_size", - "512", - ] if self.current_system.name in ["archer2"]: self.executable = "" self.extra_resources = { @@ -54,25 +42,53 @@ def setup_systems(self): "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD", "HOME": "$PWD", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/archer2benchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "8", + "--t_subset_size", + "2048", + "--v_subset_size", + "512", + ] + elif self.current_system.name in ["cirrus"]: - self.executable_opts[2] = ( - "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", - ) + # self.executable_opts[2] = ( + # "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", + # ) self.executable = "python" self.extra_resources = { "qos": {"qos": "gpu"}, } self.modules = ["openmpi/4.1.6-cuda-11.6"] self.prerun_cmds = [ - 'eval 
"$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"', - "conda activate mlperf_torch", + 'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"', + "conda activate torch_mlperf", ] self.env_vars = { "OMP_NUM_THREADS": "5", "SRUN_CPUS_PER_TASK": "5", "OMPI_MCA_mpi_warn_on_fork": "0", } + self.executable_opts = [ + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/train.py", + "--config", + "/work/z043/shared/chris-ml-intern/ML/ResNet50/Torch/configs/cirrusbenchmark_config.yaml", + "--device", + "cuda", + "-lbs", + "2", + "--t_subset_size", + "2048", + "--v_subset_size", + "512", + ] + @run_before("run") def set_task_distribution(self): diff --git a/tests/mlperf/resnet50/graphcore.py b/tests/mlperf/resnet50/graphcore.py index a77b62a..58fc8a0 100644 --- a/tests/mlperf/resnet50/graphcore.py +++ b/tests/mlperf/resnet50/graphcore.py @@ -8,7 +8,7 @@ @rfm.simple_test -class ResNetGPUServiceBenchmark(ResNet50BaseCheck): +class ResNetGPUServiceGraphCoreBenchmark(ResNet50BaseCheck): """Resnet50 test class for graphcore""" valid_prog_environs = ["*"]