Skip to content

Commit

Permalink
feat: add TFEvent experiment to integration tests (#127)
Browse files Browse the repository at this point in the history
Closes #103
  • Loading branch information
NohaIhab committed Aug 29, 2023
1 parent b00817e commit d12d568
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 1 deletion.
73 changes: 73 additions & 0 deletions tests/assets/crs/experiments/tfjob-mnist-with-summaries.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Source: katib/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml
# This example is slightly modified from upstream to consume fewer resources.
# Every line that diverges from upstream carries a `modified` comment.
# When updating this file, make sure to keep those modifications.
---
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  name: tfjob-mnist-with-summaries
spec:
  # Trial counts are capped at 1 so the experiment runs a single trial.
  parallelTrialCount: 1 # modified
  maxTrialCount: 1 # modified
  maxFailedTrialCount: 1 # modified
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy
  algorithm:
    algorithmName: random
  # Metrics are read from the TensorFlow event files that the training
  # container writes under its --log-path (see the command below).
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /mnist-with-summaries-logs/test
        kind: Directory
    collector:
      kind: TensorFlowEvent
  parameters:
    - name: learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
    - name: batch_size
      parameterType: int
      feasibleSpace:
        min: "32"
        max: "64"
  trialTemplate:
    primaryContainerName: tensorflow
    # In this example we can collect metrics only from the Worker pods.
    primaryPodLabels:
      training.kubeflow.org/replica-type: worker
    # Each trialParameter maps a search-space parameter (reference) to the
    # placeholder name substituted into trialSpec as ${trialParameters.<name>}.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: learning_rate
      - name: batchSize
        description: Batch Size
        reference: batch_size
    trialSpec:
      apiVersion: kubeflow.org/v1
      kind: TFJob
      spec:
        tfReplicaSpecs:
          Worker:
            replicas: 2
            restartPolicy: OnFailure
            template:
              spec:
                containers:
                  - name: tensorflow
                    image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest
                    command:
                      - "python"
                      - "/opt/tf-mnist-with-summaries/mnist.py"
                      - "--epochs=1"
                      - "--learning-rate=${trialParameters.learningRate}"
                      - "--batch-size=${trialParameters.batchSize}"
                      - "--log-path=/mnist-with-summaries-logs"
                    resources: # modified
                      limits: # modified
                        memory: "2Gi" # modified
                        cpu: "1" # modified
17 changes: 16 additions & 1 deletion tests/integration/test_katib_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
kind="Profile",
plural="profiles",
)
TRAINING_CHARM = "training-operator"


@pytest.fixture(scope="module")
Expand All @@ -37,6 +38,19 @@ def lightkube_client() -> lightkube.Client:
return client


@pytest.fixture(scope="module")
async def training_operator(ops_test: OpsTest):
    """Deploy the training-operator charm and wait for it to become active.

    Module-scoped: the charm is deployed once and shared by every test in
    this module that requests the fixture. Nothing is returned; the fixture
    exists for its deployment side effect on the test model.
    """
    # Five minutes, expressed in seconds, for the charm to reach active.
    settle_timeout = 60 * 5

    await ops_test.model.deploy(
        entity_url=TRAINING_CHARM, channel="latest/edge", trust=True
    )
    # raise_on_blocked is disabled, so a blocked status seen while waiting
    # does not abort the wait before the timeout.
    await ops_test.model.wait_for_idle(
        apps=[TRAINING_CHARM],
        status="active",
        raise_on_blocked=False,
        timeout=settle_timeout,
    )


@pytest.fixture(scope="module")
def create_profile(lightkube_client):
"""Create Profile and handle cleanup at the end of the module tests."""
Expand All @@ -60,12 +74,13 @@ def create_profile(lightkube_client):
glob.glob("tests/assets/crs/experiments/*.yaml"),
)
async def test_katib_experiments(
create_profile, lightkube_client, ops_test: OpsTest, experiment_file
create_profile, lightkube_client, training_operator, ops_test: OpsTest, experiment_file
):
"""Test Katib experiments.
Create an experiment and assert that it is Running or Succeeded. Delete the experiment after it
has completed.
Uses the `training_operator` fixture, which deploys the training-operator charm needed to run the tfjob-mnist-with-summaries.yaml example.
NOTE: This test is re-using the deployment created in test_charms::test_deploy_katib_charms().
"""
exp_name = create_experiment(
Expand Down

0 comments on commit d12d568

Please sign in to comment.