kubeflow · google-oss-prow · Mar 4, 2024 · Mar 2, 2024 · Mar 2, 2024
diff --git a/.github/workflows/e2e-test-mxnet-mnist.yaml b/.github/workflows/e2e-test-mxnet-mnist.yaml
diff --git a/.github/workflows/e2e-test-pytorch-mnist.yaml b/.github/workflows/e2e-test-pytorch-mnist.yaml
@@ -37,5 +37,9 @@ jobs:
         kubernetes-version: ["v1.25.12", "v1.26.6", "v1.27.3"]
         # Comma Delimited
         experiments:
+          # suggestion-hyperopt
+          - "long-running-resume,from-volume-resume,median-stop"
+          # others
+          - "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband"
           - "file-metrics-collector,pytorchjob-mnist"
           - "median-stop-with-json-format,file-metrics-collector-with-json-format"
diff --git a/.github/workflows/e2e-test-ui-random-search-postgres.yaml b/.github/workflows/e2e-test-ui-random-search-postgres.yaml
@@ -25,7 +25,7 @@ jobs:
         with:
           experiments: random
           # Comma Delimited
-          trial-images: mxnet-mnist
+          trial-images: pytorch-mnist-cpu
           katib-ui: true
           database-type: postgres
 

diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml
@@ -22,9 +22,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - trial-name: mxnet-mnist
-            platforms: linux/amd64,linux/arm64
-            dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
           - trial-name: pytorch-mnist-cpu
             platforms: linux/amd64,linux/arm64
             dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu

diff --git a/docs/images-location.md b/docs/images-location.md
@@ -238,17 +238,6 @@ The following table shows images for training containers which are used in the
         <b>Location</b>
       </td>
     </tr>
-    <tr align="center">
-      <td>
-        <code>docker.io/kubeflowkatib/mxnet-mnist</code>
-      </td>
-      <td>
-        MXNet MNIST example with collecting metrics time
-      </td>
-      <td>
-        <a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/mxnet-mnist/Dockerfile">Dockerfile</a>
-      </td>
-    </tr>
     <tr align="center">
       <td>
         <code>docker.io/kubeflowkatib/pytorch-mnist-cpu</code>

diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md
@@ -104,8 +104,6 @@ Check the following images for the Trial containers:
 
 - [Tensorflow MNIST with summaries](./trial-images/tf-mnist-with-summaries)
 
-- [MXNet MNIST](./trial-images/mxnet-mnist)
-
 - [PyTorch MNIST](./trial-images/pytorch-mnist)
 
 - [ENAS Keras CNN CIFAR-10](./trial-images/enas-cnn-cifar10)

diff --git a/examples/v1beta1/argo/argo-workflow.yaml b/examples/v1beta1/argo/argo-workflow.yaml
@@ -13,11 +13,9 @@ metadata:
   name: katib-argo-workflow
 spec:
   objective:
-    type: maximize
-    goal: 0.99
-    objectiveMetricName: Validation-accuracy
-    additionalMetricNames:
-      - Train-accuracy
+    type: minimize
+    goal: 0.001
+    objectiveMetricName: loss
   algorithm:
     algorithmName: random
   parallelTrialCount: 2
@@ -50,35 +48,36 @@ spec:
           - name: hp-workflow
             steps:
               - - name: data-preprocessing
-                  template: gen-num-examples
+                  template: gen-epochs
               - - name: model-training
                   template: model-training
                   arguments:
                     parameters:
-                      - name: num-examples
+                      - name: epochs
                         value: "{{steps.data-preprocessing.outputs.result}}"
 
-          - name: gen-num-examples
+          - name: gen-epochs
             script:
               image: python:alpine3.6
               command:
                 - python
               source: |
                 import random
-                print(60000//random.randint(10, 100))
+                print(60000//random.randint(3000, 30000))
 
           - name: model-training
             metadata:
               labels:
                 katib.kubeflow.org/model-training: "true"
             inputs:
               parameters:
-                - name: num-examples
+                - name: epochs
             container:
               name: model-training
-              image: docker.io/kubeflowkatib/mxnet-mnist:latest
+              image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
               command:
                 - "python3"
-                - "/opt/mxnet-mnist/mnist.py"
+                - "/opt/pytorch-mnist/mnist.py"
                 - "--lr=${trialParameters.learningRate}"
-                - "--num-examples={{inputs.parameters.num-examples}}"
+                - "--epochs={{inputs.parameters.epochs}}"
+                - "--batch-size=16"
diff --git a/examples/v1beta1/early-stopping/median-stop.yaml b/examples/v1beta1/early-stopping/median-stop.yaml
@@ -8,11 +8,9 @@ metadata:
   name: median-stop
 spec:
   objective:
-    type: maximize
-    goal: 0.99
-    objectiveMetricName: Validation-accuracy
-    additionalMetricNames:
-      - Train-accuracy
+    type: minimize
+    goal: 0.001
+    objectiveMetricName: loss
   algorithm:
     algorithmName: random
   earlyStopping:
@@ -30,22 +28,22 @@ spec:
       parameterType: double
       feasibleSpace:
         min: "0.01"
-        max: "0.5"
-    - name: num-epochs
-      parameterType: int
+        max: "0.05"
+    - name: momentum
+      parameterType: double
       feasibleSpace:
-        min: "3"
-        max: "4"
+        min: "0.5"
+        max: "0.9"
   trialTemplate:
     retain: true
     primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model
         reference: lr
-      - name: numberEpochs
-        description: Number of epochs to train the model
-        reference: num-epochs
+      - name: momentum
+        description: Momentum for the training model
+        reference: momentum
     trialSpec:
       apiVersion: batch/v1
       kind: Job
@@ -54,11 +52,12 @@ spec:
           spec:
             containers:
               - name: training-container
-                image: docker.io/kubeflowkatib/mxnet-mnist:latest
+                image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
                 command:
                   - "python3"
-                  - "/opt/mxnet-mnist/mnist.py"
-                  - "--batch-size=64"
+                  - "/opt/pytorch-mnist/mnist.py"
+                  - "--epochs=1"
+                  - "--batch-size=16"
                   - "--lr=${trialParameters.learningRate}"
-                  - "--num-epochs=${trialParameters.numberEpochs}"
+                  - "--momentum=${trialParameters.momentum}"
             restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/bayesian-optimization.yaml b/examples/v1beta1/hp-tuning/bayesian-optimization.yaml
@@ -6,11 +6,9 @@ metadata:
   name: bayesian-optimization
 spec:
   objective:
-    type: maximize
-    goal: 0.99
-    objectiveMetricName: Validation-accuracy
-    additionalMetricNames:
-      - Train-accuracy
+    type: minimize
+    goal: 0.001
+    objectiveMetricName: loss
   algorithm:
     algorithmName: bayesianoptimization
     algorithmSettings:
@@ -24,31 +22,21 @@ spec:
       parameterType: double
       feasibleSpace:
         min: "0.01"
-        max: "0.03"
-    - name: num-layers
-      parameterType: int
-      feasibleSpace:
-        min: "2"
-        max: "5"
-    - name: optimizer
-      parameterType: categorical
+        max: "0.05"
+    - name: momentum
+      parameterType: double
       feasibleSpace:
-        list:
-          - sgd
-          - adam
-          - ftrl
+        min: "0.5"
+        max: "0.9"
   trialTemplate:
     primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model
         reference: lr
-      - name: numberLayers
-        description: Number of training model layers
-        reference: num-layers
-      - name: optimizer
-        description: Training model optimizer (sdg, adam or ftrl)
-        reference: optimizer
+      - name: momentum
+        description: Momentum for the training model
+        reference: momentum
     trialSpec:
       apiVersion: batch/v1
       kind: Job
@@ -57,12 +45,12 @@ spec:
           spec:
             containers:
               - name: training-container
-                image: docker.io/kubeflowkatib/mxnet-mnist:latest
+                image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
                 command:
                   - "python3"
-                  - "/opt/mxnet-mnist/mnist.py"
-                  - "--batch-size=64"
+                  - "/opt/pytorch-mnist/mnist.py"
+                  - "--epochs=1"
+                  - "--batch-size=16"
                   - "--lr=${trialParameters.learningRate}"
-                  - "--num-layers=${trialParameters.numberLayers}"
-                  - "--optimizer=${trialParameters.optimizer}"
+                  - "--momentum=${trialParameters.momentum}"
             restartPolicy: Never
diff --git a/examples/v1beta1/hp-tuning/cma-es.yaml b/examples/v1beta1/hp-tuning/cma-es.yaml
@@ -6,11 +6,9 @@ metadata:
   name: cmaes
 spec:
   objective:
-    type: maximize
-    goal: 0.99
-    objectiveMetricName: Validation-accuracy
-    additionalMetricNames:
-      - Train-accuracy
+    type: minimize
+    goal: 0.001
+    objectiveMetricName: loss
   algorithm:
     algorithmName: cmaes
     algorithmSettings:
@@ -24,31 +22,21 @@ spec:
       parameterType: double
       feasibleSpace:
         min: "0.01"
-        max: "0.03"
-    - name: num-layers
-      parameterType: int
-      feasibleSpace:
-        min: "2"
-        max: "5"
-    - name: optimizer
-      parameterType: categorical
+        max: "0.05"
+    - name: momentum
+      parameterType: double
       feasibleSpace:
-        list:
-          - sgd
-          - adam
-          - ftrl
+        min: "0.5"
+        max: "0.9"
   trialTemplate:
     primaryContainerName: training-container
     trialParameters:
       - name: learningRate
         description: Learning rate for the training model
         reference: lr
-      - name: numberLayers
-        description: Number of training model layers
-        reference: num-layers
-      - name: optimizer
-        description: Training model optimizer (sdg, adam or ftrl)
-        reference: optimizer
+      - name: momentum
+        description: Momentum for the training model
+        reference: momentum
     trialSpec:
       apiVersion: batch/v1
       kind: Job
@@ -57,12 +45,12 @@ spec:
           spec:
             containers:
               - name: training-container
-                image: docker.io/kubeflowkatib/mxnet-mnist:latest
+                image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
                 command:
                   - "python3"
-                  - "/opt/mxnet-mnist/mnist.py"
-                  - "--batch-size=64"
+                  - "/opt/pytorch-mnist/mnist.py"
+                  - "--epochs=1"
+                  - "--batch-size=16"
                   - "--lr=${trialParameters.learningRate}"
-                  - "--num-layers=${trialParameters.numberLayers}"
-                  - "--optimizer=${trialParameters.optimizer}"
+                  - "--momentum=${trialParameters.momentum}"
             restartPolicy: Never