feat: extend integration tests for experiments (#105)

* feat: extend integration tests - make create_experiment test parametrizable - refactor into utils and tests * fix: remove kubeflow namespace from experiment * feat: test delete experiment * feat: add examples to cover all images and reduce trial counts * fix: change container images in examples to v0.15.0 * feat: check experiment is running or succeeded * feat: deploy experiments in user namespace with profiles operator --------- Signed-off-by: Phoevos Kalemkeris <[email protected]> Co-authored-by: Phoevos Kalemkeris <[email protected]>
canonical · Aug 17, 2023 · b3b6981 · b3b6981
1 parent 4e4fffc
commit b3b6981
Show file tree

Hide file tree

Showing 15 changed files with 913 additions and 115 deletions.
diff --git a/.github/workflows/integrate.yaml b/.github/workflows/integrate.yaml
@@ -94,6 +94,20 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
+      # Ideally we'd use self-hosted runners, but that effort is not yet stable.
+      # This action removes unused software (dotnet, haskell, android libs, codeql,
+      # and docker images) from the GH runner, freeing around 60 GB of storage,
+      # split into about 40 GB for root and around 20 GB for a mount point.
+      - name: Maximise GH runner space
+        uses: easimon/maximize-build-space@v7
+        with:
+          root-reserve-mb: 40960
+          remove-dotnet: 'true'
+          remove-haskell: 'true'
+          remove-android: 'true'
+          remove-codeql: 'true'
+          remove-docker-images: 'true'
+
       - name: Check out code
         uses: actions/checkout@v3
       - name: Setup operator environment
@@ -102,7 +116,7 @@ jobs:
           provider: microk8s
           channel: 1.24/stable
           juju-channel: 2.9/stable
-          microk8s-addons: "dns storage rbac metallb:10.64.140.43-10.64.140.49"
+          microk8s-addons: "dns storage rbac"
 
       - name: Run test
         run: |

diff --git a/tests/assets/crs/experiments/bayesian-optimization.yaml b/tests/assets/crs/experiments/bayesian-optimization.yaml
@@ -0,0 +1,75 @@
+# Source: katib/examples/v1beta1/hp-tuning/bayesian-optimization.yaml
+# This example is slightly modified from upstream to consume less resources.
+# There's a `modified` comment where we diverge from upstream.
+# When updating this file, make sure to keep those modifications.
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  name: bayesian-optimization
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: Validation-accuracy
+    additionalMetricNames:
+      - Train-accuracy
+  algorithm:
+    algorithmName: bayesianoptimization
+    algorithmSettings:
+      - name: "random_state"
+        value: "10"
+  parallelTrialCount: 1  # modified
+  maxTrialCount: 1  # modified
+  maxFailedTrialCount: 1  # modified
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.03"
+    - name: num-layers
+      parameterType: int
+      feasibleSpace:
+        min: "2"
+        max: "5"
+    - name: optimizer
+      parameterType: categorical
+      feasibleSpace:
+        list:
+          - sgd
+          - adam
+          - ftrl
+  trialTemplate:
+    primaryContainerName: training-container
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+      - name: numberLayers
+        description: Number of training model layers
+        reference: num-layers
+      - name: optimizer
+        description: Training model optimizer (sgd, adam or ftrl)
+        reference: optimizer
+    trialSpec:
+      apiVersion: batch/v1
+      kind: Job
+      spec:
+        template:
+          spec:
+            containers:
+              - name: training-container
+                image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0
+                command:
+                  - "python3"
+                  - "/opt/mxnet-mnist/mnist.py"
+                  - "--batch-size=64"
+                  - "--lr=${trialParameters.learningRate}"
+                  - "--num-layers=${trialParameters.numberLayers}"
+                  - "--optimizer=${trialParameters.optimizer}"
+                resources:  # modified
+                  limits:  # modified
+                    memory: "2Gi"  # modified
+                    cpu: "1"  # modified
+            restartPolicy: Never
diff --git a/tests/assets/crs/experiments/cmaes.yaml b/tests/assets/crs/experiments/cmaes.yaml
@@ -0,0 +1,75 @@
+# Source: katib/examples/v1beta1/hp-tuning/cma-es.yaml
+# This example is slightly modified from upstream to consume less resources.
+# There's a `modified` comment where we diverge from upstream.
+# When updating this file, make sure to keep those modifications.
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  name: cmaes
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: Validation-accuracy
+    additionalMetricNames:
+      - Train-accuracy
+  algorithm:
+    algorithmName: cmaes
+    algorithmSettings:
+      - name: "restart_strategy"
+        value: "ipop"
+  parallelTrialCount: 1  # modified
+  maxTrialCount: 1  # modified
+  maxFailedTrialCount: 1  # modified
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.03"
+    - name: num-layers
+      parameterType: int
+      feasibleSpace:
+        min: "2"
+        max: "5"
+    - name: optimizer
+      parameterType: categorical
+      feasibleSpace:
+        list:
+          - sgd
+          - adam
+          - ftrl
+  trialTemplate:
+    primaryContainerName: training-container
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+      - name: numberLayers
+        description: Number of training model layers
+        reference: num-layers
+      - name: optimizer
+        description: Training model optimizer (sgd, adam or ftrl)
+        reference: optimizer
+    trialSpec:
+      apiVersion: batch/v1
+      kind: Job
+      spec:
+        template:
+          spec:
+            containers:
+              - name: training-container
+                image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0
+                command:
+                  - "python3"
+                  - "/opt/mxnet-mnist/mnist.py"
+                  - "--batch-size=64"
+                  - "--lr=${trialParameters.learningRate}"
+                  - "--num-layers=${trialParameters.numberLayers}"
+                  - "--optimizer=${trialParameters.optimizer}"
+                resources:  # modified
+                  limits:  # modified
+                    memory: "2Gi"  # modified
+                    cpu: "1"  # modified
+            restartPolicy: Never
diff --git a/tests/assets/crs/experiments/darts-cpu.yaml b/tests/assets/crs/experiments/darts-cpu.yaml
@@ -0,0 +1,77 @@
+# Source: katib/examples/v1beta1/nas/darts-cpu.yaml
+# This example is slightly modified from upstream to consume less resources.
+# There's a `modified` comment where we diverge from upstream.
+# When updating this file, make sure to keep those modifications.
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  name: darts-cpu
+spec:
+  parallelTrialCount: 1
+  maxTrialCount: 1
+  maxFailedTrialCount: 1
+  objective:
+    type: maximize
+    objectiveMetricName: Best-Genotype
+  metricsCollectorSpec:
+    collector:
+      kind: StdOut
+    source:
+      filter:
+        metricsFormat:
+          - "([\\w-]+)=(Genotype.*)"
+  algorithm:
+    algorithmName: darts
+    algorithmSettings:
+      - name: num_epochs
+        value: "1"
+      - name: num_nodes
+        value: "1"
+      - name: init_channels
+        value: "1"
+      - name: stem_multiplier
+        value: "1"
+  nasConfig:
+    graphConfig:
+      numLayers: 1
+    operations:
+      - operationType: max_pooling
+        parameters:
+          - name: filter_size
+            parameterType: categorical
+            feasibleSpace:
+              list:
+                - "3"
+  trialTemplate:
+    primaryContainerName: training-container
+    trialParameters:
+      - name: algorithmSettings
+        description: Algorithm settings of DARTS Experiment
+        reference: algorithm-settings
+      - name: searchSpace
+        description: Search Space of DARTS Experiment
+        reference: search-space
+      - name: numberLayers
+        description: Number of Neural Network layers
+        reference: num-layers
+    trialSpec:
+      apiVersion: batch/v1
+      kind: Job
+      spec:
+        template:
+          spec:
+            containers:
+              - name: training-container
+                image: docker.io/kubeflowkatib/darts-cnn-cifar10-cpu:v0.15.0
+                command:
+                  - python3
+                  - run_trial.py
+                  - --algorithm-settings="${trialParameters.algorithmSettings}"
+                  - --search-space="${trialParameters.searchSpace}"
+                  - --num-layers="${trialParameters.numberLayers}"
+                resources:  # modified
+                  limits:  # modified
+                    memory: "2Gi"  # modified
+                    cpu: "1"  # modified
+            restartPolicy: Never