diff --git a/tests/assets/crs/experiments/tfjob-mnist-with-summaries.yaml b/tests/assets/crs/experiments/tfjob-mnist-with-summaries.yaml
new file mode 100644
index 00000000..050d97c9
--- /dev/null
+++ b/tests/assets/crs/experiments/tfjob-mnist-with-summaries.yaml
@@ -0,0 +1,73 @@
+# Source: katib/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml
+# This example is slightly modified from upstream to consume fewer resources.
+# There's a `modified` comment where we diverge from upstream.
+# When updating this file, make sure to keep those modifications.
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  name: tfjob-mnist-with-summaries
+spec:
+  parallelTrialCount: 1 # modified
+  maxTrialCount: 1 # modified
+  maxFailedTrialCount: 1 # modified
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: accuracy
+  algorithm:
+    algorithmName: random
+  metricsCollectorSpec:
+    source:
+      fileSystemPath:
+        path: /mnist-with-summaries-logs/test
+        kind: Directory
+    collector:
+      kind: TensorFlowEvent
+  parameters:
+    - name: learning_rate
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.05"
+    - name: batch_size
+      parameterType: int
+      feasibleSpace:
+        min: "32"
+        max: "64"
+  trialTemplate:
+    primaryContainerName: tensorflow
+    # In this example we can collect metrics only from the Worker pods.
+    primaryPodLabels:
+      training.kubeflow.org/replica-type: worker
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: learning_rate
+      - name: batchSize
+        description: Batch Size
+        reference: batch_size
+    trialSpec:
+      apiVersion: kubeflow.org/v1
+      kind: TFJob
+      spec:
+        tfReplicaSpecs:
+          Worker:
+            replicas: 2
+            restartPolicy: OnFailure
+            template:
+              spec:
+                containers:
+                  - name: tensorflow
+                    image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest
+                    command:
+                      - "python"
+                      - "/opt/tf-mnist-with-summaries/mnist.py"
+                      - "--epochs=1"
+                      - "--learning-rate=${trialParameters.learningRate}"
+                      - "--batch-size=${trialParameters.batchSize}"
+                      - "--log-path=/mnist-with-summaries-logs"
+                    resources: # modified
+                      limits: # modified
+                        memory: "2Gi" # modified
+                        cpu: "1" # modified
diff --git a/tests/integration/test_katib_experiments.py b/tests/integration/test_katib_experiments.py
index 9d1e9e4d..05e5200e 100644
--- a/tests/integration/test_katib_experiments.py
+++ b/tests/integration/test_katib_experiments.py
@@ -27,6 +27,7 @@
     kind="Profile",
     plural="profiles",
 )
+TRAINING_CHARM = "training-operator"
 
 
 @pytest.fixture(scope="module")
@@ -37,6 +38,19 @@ def lightkube_client() -> lightkube.Client:
     return client
 
 
+@pytest.fixture(scope="module")
+async def training_operator(ops_test: OpsTest):
+    """Deploy the training-operator charm and wait until it's active."""
+    await ops_test.model.deploy(
+        entity_url=TRAINING_CHARM,
+        channel="latest/edge",
+        trust=True,
+    )
+    await ops_test.model.wait_for_idle(
+        apps=[TRAINING_CHARM], status="active", raise_on_blocked=False, timeout=60 * 5
+    )
+
+
 @pytest.fixture(scope="module")
 def create_profile(lightkube_client):
     """Create Profile and handle cleanup at the end of the module tests."""
@@ -60,12 +74,13 @@
     glob.glob("tests/assets/crs/experiments/*.yaml"),
 )
 async def test_katib_experiments(
-    create_profile, lightkube_client, ops_test: OpsTest, experiment_file
+    create_profile, lightkube_client, training_operator, ops_test: OpsTest, experiment_file
 ):
     """Test Katib experiments.
 
     Create an experiment and assert that it is Running or Succeeded. Delete the experiment
     after it has completed.
+    Uses the `training-operator` fixture, which is required to run the tfjob-mnist-with-summaries.yaml example.
     NOTE: This test is re-using the deployment created in test_charms::test_deploy_katib_charms().
     """
     exp_name = create_experiment(
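
For context, below is a minimal sketch, not the repo's actual helper, of one plausible shape for the `create_experiment` call used in the test above, together with a condition check. It assumes lightkube's generic-resource API for the Katib `Experiment` CRD; the helper names, signatures, and polling details are illustrative only.

```python
# Hypothetical sketch of an Experiment helper; not the code under review.
from pathlib import Path

import lightkube
from lightkube import codecs
from lightkube.generic_resource import create_namespaced_resource

# Register the Katib Experiment CRD so codecs can decode it from YAML.
EXPERIMENT = create_namespaced_resource(
    group="kubeflow.org", version="v1beta1", kind="Experiment", plural="experiments"
)


def create_experiment(client: lightkube.Client, experiment_file: str, namespace: str) -> str:
    """Apply the Experiment manifest and return the experiment's name (illustrative)."""
    exp = codecs.load_all_yaml(Path(experiment_file).read_text())[0]
    client.create(exp, namespace=namespace)
    return exp.metadata.name


def experiment_reached(client: lightkube.Client, name: str, namespace: str, cond: str) -> bool:
    """Return True if the Experiment reports the given condition (e.g. Running, Succeeded)."""
    exp = client.get(EXPERIMENT, name=name, namespace=namespace)
    conditions = (exp.status or {}).get("conditions", [])
    return any(c.get("type") == cond and c.get("status") == "True" for c in conditions)
```

A test would then poll `experiment_reached(...)` for "Running" or "Succeeded" until a timeout, and delete the Experiment afterwards, matching the behavior the docstring describes.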