feat: add TFEvent experiment to integration tests (#127)

Closes #103
canonical · Aug 29, 2023 · d12d568 · d12d568
1 parent b00817e
commit d12d568
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 1 deletion.
diff --git a/tests/assets/crs/experiments/tfjob-mnist-with-summaries.yaml b/tests/assets/crs/experiments/tfjob-mnist-with-summaries.yaml
@@ -0,0 +1,73 @@
+# Source: katib/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml
+# This example is slightly modified from upstream to consume fewer resources.
+# There's a `modified` comment where we diverge from upstream.
+# When updating this file, make sure to keep those modifications.
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  name: tfjob-mnist-with-summaries
+spec:
+  parallelTrialCount: 1  # modified
+  maxTrialCount: 1  # modified
+  maxFailedTrialCount: 1  # modified
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: accuracy
+  algorithm:
+    algorithmName: random
+  metricsCollectorSpec:
+    source:
+      fileSystemPath:
+        path: /mnist-with-summaries-logs/test
+        kind: Directory
+    collector:
+      kind: TensorFlowEvent
+  parameters:
+    - name: learning_rate
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.05"
+    - name: batch_size
+      parameterType: int
+      feasibleSpace:
+        min: "32"
+        max: "64"
+  trialTemplate:
+    primaryContainerName: tensorflow
+    # In this example we can collect metrics only from the Worker pods.
+    primaryPodLabels:
+      training.kubeflow.org/replica-type: worker
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: learning_rate
+      - name: batchSize
+        description: Batch Size
+        reference: batch_size
+    trialSpec:
+      apiVersion: kubeflow.org/v1
+      kind: TFJob
+      spec:
+        tfReplicaSpecs:
+          Worker:
+            replicas: 2
+            restartPolicy: OnFailure
+            template:
+              spec:
+                containers:
+                  - name: tensorflow
+                    image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest
+                    command:
+                      - "python"
+                      - "/opt/tf-mnist-with-summaries/mnist.py"
+                      - "--epochs=1"
+                      - "--learning-rate=${trialParameters.learningRate}"
+                      - "--batch-size=${trialParameters.batchSize}"
+                      - "--log-path=/mnist-with-summaries-logs"
+                    resources:  # modified
+                      limits:  # modified
+                        memory: "2Gi"  # modified
+                        cpu: "1"  # modified
diff --git a/tests/integration/test_katib_experiments.py b/tests/integration/test_katib_experiments.py
@@ -27,6 +27,7 @@
     kind="Profile",
     plural="profiles",
 )
+TRAINING_CHARM = "training-operator"
 
 
 @pytest.fixture(scope="module")
@@ -37,6 +38,19 @@ def lightkube_client() -> lightkube.Client:
     return client
 
 
+@pytest.fixture(scope="module")  # module scope: set up once, shared by the module's tests
+async def training_operator(ops_test: OpsTest):
+    """Deploy training-operator charm, and wait until it's active."""
+    await ops_test.model.deploy(
+        entity_url=TRAINING_CHARM,  # charm name constant defined at module level
+        channel="latest/edge",
+        trust=True,  # NOTE(review): presumably the charm needs cluster access - confirm
+    )
+    await ops_test.model.wait_for_idle(
+        apps=[TRAINING_CHARM], status="active", raise_on_blocked=False, timeout=60 * 5  # 5 min
+    )
+
+
 @pytest.fixture(scope="module")
 def create_profile(lightkube_client):
     """Create Profile and handle cleanup at the end of the module tests."""
@@ -60,12 +74,13 @@ def create_profile(lightkube_client):
     glob.glob("tests/assets/crs/experiments/*.yaml"),
 )
 async def test_katib_experiments(
-    create_profile, lightkube_client, ops_test: OpsTest, experiment_file
+    create_profile, lightkube_client, training_operator, ops_test: OpsTest, experiment_file
 ):
     """Test Katib experiments.
 
     Create an experiment and assert that it is Running or Succeeded. Delete the experiment after it
     has completed.
+    Uses `training-operator` fixture needed to run the tfjob-mnist-with-summaries.yaml example.
     NOTE: This test is re-using the deployment created in test_charms::test_deploy_katib_charms().
     """
     exp_name = create_experiment(