Skip to content

Commit

Permalink
feat: extend integration tests for experiments (#105)
Browse files Browse the repository at this point in the history
* feat: extend integration tests
- make create_experiment test parametrizable
- refactor into utils and tests
* fix: remove kubeflow namespace from experiment
* feat: test delete experiment
* feat: add examples to cover all images and reduce trial counts
* fix: change container images in examples to v0.15.0
* feat: check experiment is running or succeeded
* feat: deploy experiments in user namespace with profiles operator
---------

Signed-off-by: Phoevos Kalemkeris <[email protected]>
Co-authored-by: Phoevos Kalemkeris <[email protected]>
  • Loading branch information
NohaIhab and phoevos committed Aug 17, 2023
1 parent 4e4fffc commit b3b6981
Show file tree
Hide file tree
Showing 15 changed files with 913 additions and 115 deletions.
16 changes: 15 additions & 1 deletion .github/workflows/integrate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ jobs:
runs-on: ubuntu-20.04

steps:
# Ideally we'd use self-hosted runners, but that setup is not stable yet.
# This action removes unused software (dotnet, haskell, android libs, codeql,
# and docker images) from the GH runner, freeing around 60 GB of storage:
# about 40 GB on root and around 20 GB on a mnt point.
- name: Maximise GH runner space
uses: easimon/maximize-build-space@v7
with:
root-reserve-mb: 40960
remove-dotnet: 'true'
remove-haskell: 'true'
remove-android: 'true'
remove-codeql: 'true'
remove-docker-images: 'true'

- name: Check out code
uses: actions/checkout@v3
- name: Setup operator environment
Expand All @@ -102,7 +116,7 @@ jobs:
provider: microk8s
channel: 1.24/stable
juju-channel: 2.9/stable
microk8s-addons: "dns storage rbac metallb:10.64.140.43-10.64.140.49"
microk8s-addons: "dns storage rbac"

- name: Run test
run: |
Expand Down
75 changes: 75 additions & 0 deletions tests/assets/crs/experiments/bayesian-optimization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Source: katib/examples/v1beta1/hp-tuning/bayesian-optimization.yaml
# This example is slightly modified from upstream to consume fewer resources.
# There's a `modified` comment where we diverge from upstream.
# When updating this file, make sure to keep those modifications.
---
# Katib Experiment that searches lr / num-layers / optimizer for the
# mxnet-mnist training image using the `bayesianoptimization` algorithm.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  name: bayesian-optimization
spec:
  # Maximize Validation-accuracy; Train-accuracy is collected alongside it.
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: Validation-accuracy
    additionalMetricNames:
      - Train-accuracy
  algorithm:
    algorithmName: bayesianoptimization
    algorithmSettings:
      - name: "random_state"
        value: "10"
  # A single trial keeps the integration test cheap.
  parallelTrialCount: 1  # modified
  maxTrialCount: 1  # modified
  maxFailedTrialCount: 1  # modified
  # Search space; each `name` is referenced from trialParameters below.
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.03"
    - name: num-layers
      parameterType: int
      feasibleSpace:
        min: "2"
        max: "5"
    - name: optimizer
      parameterType: categorical
      feasibleSpace:
        list:
          - sgd
          - adam
          - ftrl
  trialTemplate:
    primaryContainerName: training-container
    # Each entry maps a search-space parameter (`reference`) to the
    # `${trialParameters.<name>}` placeholder used in the Job command.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
      - name: numberLayers
        description: Number of training model layers
        reference: num-layers
      - name: optimizer
        description: Training model optimizer (sgd, adam or ftrl)  # modified
        reference: optimizer
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0
                command:
                  - "python3"
                  - "/opt/mxnet-mnist/mnist.py"
                  - "--batch-size=64"
                  - "--lr=${trialParameters.learningRate}"
                  - "--num-layers=${trialParameters.numberLayers}"
                  - "--optimizer=${trialParameters.optimizer}"
                resources:  # modified
                  limits:  # modified
                    memory: "2Gi"  # modified
                    cpu: "1"  # modified
            restartPolicy: Never
75 changes: 75 additions & 0 deletions tests/assets/crs/experiments/cmaes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Source: katib/examples/v1beta1/hp-tuning/cma-es.yaml
# This example is slightly modified from upstream to consume fewer resources.
# There's a `modified` comment where we diverge from upstream.
# When updating this file, make sure to keep those modifications.
---
# Katib Experiment that searches lr / num-layers / optimizer for the
# mxnet-mnist training image using the `cmaes` algorithm.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  name: cmaes
spec:
  # Maximize Validation-accuracy; Train-accuracy is collected alongside it.
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: Validation-accuracy
    additionalMetricNames:
      - Train-accuracy
  algorithm:
    algorithmName: cmaes
    algorithmSettings:
      - name: "restart_strategy"
        value: "ipop"
  # A single trial keeps the integration test cheap.
  parallelTrialCount: 1  # modified
  maxTrialCount: 1  # modified
  maxFailedTrialCount: 1  # modified
  # Search space; each `name` is referenced from trialParameters below.
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.03"
    - name: num-layers
      parameterType: int
      feasibleSpace:
        min: "2"
        max: "5"
    - name: optimizer
      parameterType: categorical
      feasibleSpace:
        list:
          - sgd
          - adam
          - ftrl
  trialTemplate:
    primaryContainerName: training-container
    # Each entry maps a search-space parameter (`reference`) to the
    # `${trialParameters.<name>}` placeholder used in the Job command.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
      - name: numberLayers
        description: Number of training model layers
        reference: num-layers
      - name: optimizer
        description: Training model optimizer (sgd, adam or ftrl)  # modified
        reference: optimizer
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: docker.io/kubeflowkatib/mxnet-mnist:v0.15.0
                command:
                  - "python3"
                  - "/opt/mxnet-mnist/mnist.py"
                  - "--batch-size=64"
                  - "--lr=${trialParameters.learningRate}"
                  - "--num-layers=${trialParameters.numberLayers}"
                  - "--optimizer=${trialParameters.optimizer}"
                resources:  # modified
                  limits:  # modified
                    memory: "2Gi"  # modified
                    cpu: "1"  # modified
            restartPolicy: Never
77 changes: 77 additions & 0 deletions tests/assets/crs/experiments/darts-cpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Source: katib/examples/v1beta1/nas/darts-cpu.yaml
# This example is slightly modified from upstream to consume fewer resources.
# There's a `modified` comment where we diverge from upstream.
# When updating this file, make sure to keep those modifications.
---
# Katib Experiment that runs the `darts` algorithm with the
# darts-cnn-cifar10-cpu trial image.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  name: darts-cpu
spec:
  # NOTE(review): the file header says divergences from upstream carry a
  # `modified` comment; these counts have none - confirm they match upstream.
  parallelTrialCount: 1
  maxTrialCount: 1
  maxFailedTrialCount: 1
  objective:
    type: maximize
    objectiveMetricName: Best-Genotype
  # Metrics are read from the trial's stdout; only lines matching the
  # `<name>=Genotype...` pattern below are kept.
  metricsCollectorSpec:
    collector:
      kind: StdOut
    source:
      filter:
        metricsFormat:
          - "([\\w-]+)=(Genotype.*)"
  algorithm:
    algorithmName: darts
    algorithmSettings:
      - name: num_epochs
        value: "1"
      - name: num_nodes
        value: "1"
      - name: init_channels
        value: "1"
      - name: stem_multiplier
        value: "1"
  nasConfig:
    graphConfig:
      numLayers: 1
    operations:
      - operationType: max_pooling
        parameters:
          - name: filter_size
            parameterType: categorical
            feasibleSpace:
              list:
                - "3"
  trialTemplate:
    primaryContainerName: training-container
    # Each entry exposes a `reference` value as the
    # `${trialParameters.<name>}` placeholder used in the Job command.
    trialParameters:
      - name: algorithmSettings
        description: Algorithm settings of DARTS Experiment
        reference: algorithm-settings
      - name: searchSpace
        description: Search Space of DARTS Experiment
        reference: search-space
      - name: numberLayers
        description: Number of Neural Network layers
        reference: num-layers
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: docker.io/kubeflowkatib/darts-cnn-cifar10-cpu:v0.15.0
                command:
                  - python3
                  - run_trial.py
                  - --algorithm-settings="${trialParameters.algorithmSettings}"
                  - --search-space="${trialParameters.searchSpace}"
                  - --num-layers="${trialParameters.numberLayers}"
                resources:  # modified
                  limits:  # modified
                    memory: "2Gi"  # modified
                    cpu: "1"  # modified
            restartPolicy: Never
Loading

0 comments on commit b3b6981

Please sign in to comment.