From d8f63ea5d3479d86fd85abea7da58ec0a6fd6a2c Mon Sep 17 00:00:00 2001
From: Noha Ihab <49988746+NohaIhab@users.noreply.github.com>
Date: Tue, 20 Aug 2024 17:41:13 +0300
Subject: [PATCH 1/5] feat: configure the UATs to run behind a proxy using Notebooks (#98)

* add poddefault and configure kfp tests
* kfp: use dict.get to avoid KeyError
* configure katib uats
* configure training uats
* add testing with proxy instructions to README
---
 README.md | 49 +
 tests/notebooks/katib/katib-integration.ipynb | 399 +---
 .../notebooks/kfp_v2/kfp-v2-integration.ipynb | 201 +-
 .../training/training-integration.ipynb | 1813 +++++++++--------
 tests/proxy-poddefault.yaml | 22 +
 5 files changed, 1137 insertions(+), 1347 deletions(-)
 create mode 100644 tests/proxy-poddefault.yaml

diff --git a/README.md b/README.md
index 97af1e9..837f93e 100644
--- a/README.md
+++ b/README.md
@@ -139,6 +139,55 @@ tox -e kubeflow-remote
 tox -e kubeflow-local
 ```

+### Run behind proxy
+#### Prerequisites
+**To run the tests behind a proxy using a Notebook or using the driver, the following step is necessary:**

Edit the PodDefault `tests/proxy-poddefault.yaml` to replace the placeholders for:
 * `<proxy_address>:<proxy_port>`: the address and port of your proxy server
 * `<cluster_cidr>`: you can get this value by running:
 ```
 cat /var/snap/microk8s/current/args/kube-proxy | grep cluster-cidr
 ```
 * `<service_cluster_ip_range>`: you can get this value by running:
 ```
 cat /var/snap/microk8s/current/args/kube-apiserver | grep service-cluster-ip-range
 ```
 * `<nodes_internal_ips>`: the Internal IPs of the nodes where your cluster is running; you can get these values by running:
 ```
 microk8s kubectl get nodes -o wide
 ```
 It is the `INTERNAL-IP` value
 * `<hostname>`: the name of the host on which the cluster is deployed; you can use the `hostname` command to get it

#### Running using Notebook
To run the tests behind a proxy using a Notebook:
1. Log in to the Dashboard and create a Profile
2. Apply the PodDefault to your Profile's namespace. Make sure you have already followed the Prerequisites section to modify the PodDefault, then apply it with:
 ```
 kubectl apply -f ./tests/proxy-poddefault.yaml -n <your_namespace>
 ```
3. Create a Notebook and, under `Advanced Options > Configurations`, select `Add proxy settings`, then click `Launch` to start the Notebook. Wait for the Notebook to be Ready, then connect to it.
4. From inside the Notebook, start a new terminal session and clone this repo:

 ```bash
 git clone https://github.com/canonical/charmed-kubeflow-uats.git
 ```
 Then go to the `charmed-kubeflow-uats/tests` directory and, for each `.ipynb` test file there, open and run it.
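For reference, the PodDefault added by this patch (`tests/proxy-poddefault.yaml`) is what injects these settings: it selects Pods labelled `notebook-proxy: "true"` (the same label the katib, kfp_v2, and training notebooks attach to the workloads they create) and sets the proxy environment variables on them. The sketch below shows the general shape of such a PodDefault; the placeholder names, the `desc` string, and the exact `NO_PROXY` entries are assumptions inferred from the Prerequisites above, so treat the file in the repo as the authoritative version.

```yaml
# Hypothetical sketch of tests/proxy-poddefault.yaml; see the repo for the real file.
apiVersion: kubeflow.org/v1alpha1
kind: PodDefault
metadata:
  name: notebook-proxy
spec:
  # Rendered as the "Add proxy settings" option in the Notebook creation UI.
  desc: Add proxy settings
  selector:
    matchLabels:
      # Label that the UAT notebooks set on the Pods that should use the proxy.
      notebook-proxy: "true"
  env:
    - name: HTTP_PROXY
      value: http://<proxy_address>:<proxy_port>
    - name: HTTPS_PROXY
      value: http://<proxy_address>:<proxy_port>
    - name: NO_PROXY
      value: <cluster_cidr>,<service_cluster_ip_range>,<nodes_internal_ips>,<hostname>,127.0.0.1,localhost,.svc,.local
```

Since the Notebook server receives the same variables from this PodDefault, the tests detect proxy mode by checking that `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY` are all set in the environment, which is exactly what the `proxy_envs_set()` helpers in the notebooks below do.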
+ + Currently, the following tests are supported to run behind a proxy: + * katib + * kserve + * kfp_v2 + * training (except TFJob due to https://github.com/canonical/training-operator/issues/182) + #### Developer Notes Any environment that can be used to access and configure the Charmed Kubeflow deployment is diff --git a/tests/notebooks/katib/katib-integration.ipynb b/tests/notebooks/katib/katib-integration.ipynb index f803073..beb6467 100644 --- a/tests/notebooks/katib/katib-integration.ipynb +++ b/tests/notebooks/katib/katib-integration.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "tags": [ "pytest-skip" ] }, @@ -46,12 +46,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [ + "import os\n", + "\n", "from kubeflow.katib import (\n", " KatibClient,\n", " V1beta1AlgorithmSpec,\n", @@ -79,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "tags": [] }, @@ -99,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "tags": [] }, @@ -110,7 +112,19 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def proxy_envs_set():\n", + " if os.environ.get('HTTP_PROXY') and os.environ.get('HTTPS_PROXY') and os.environ.get('NO_PROXY'):\n", + " return True\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "tags": [] }, @@ -184,6 +198,9 @@ " }\n", "}\n", "\n", + "if proxy_envs_set():\n", + " trial_spec['spec']['template']['metadata']['labels']={\"notebook-proxy\": \"true\"}\n", + "\n", "trial_template=V1beta1TrialTemplate(\n", " primary_container_name=\"training-container\",\n", " trial_parameters=[\n", @@ -226,28 +243,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: cmaes-example\n", - "Algorithm: cmaes\n", - "Objective: loss\n", - "Trial Parameters:\n", - "- learningRate: Learning rate for the training model\n", - "- momentum: Momentum for the training model\n", - "Max Trial Count: 3\n", - "Max Failed Trial Count: 1\n", - "Parallel Trial Count: 2\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Name:\", experiment.metadata.name)\n", "print(\"Algorithm:\", experiment.spec.algorithm.algorithm_name)\n", @@ -271,22 +272,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "[exp.metadata.name for exp in client.list_experiments()]" ] @@ -302,31 +292,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment user/cmaes-example has been created\n" - ] - }, - { - "data": { - "text/html": [ - "Katib Experiment cmaes-example link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "client.create_experiment(experiment)" ] @@ -343,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null,
"metadata": { "tags": [] }, @@ -361,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "tags": [ "raises-exception" @@ -379,132 +349,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment: cmaes-example\n", - "\n", - "Experiment Spec:\n", - "{'algorithm': {'algorithm_name': 'cmaes', 'algorithm_settings': None},\n", - " 'early_stopping': None,\n", - " 'max_failed_trial_count': 1,\n", - " 'max_trial_count': 3,\n", - " 'metrics_collector_spec': {'collector': {'custom_collector': None,\n", - " 'kind': 'StdOut'},\n", - " 'source': None},\n", - " 'nas_config': None,\n", - " 'objective': {'additional_metric_names': ['Train-accuracy'],\n", - " 'goal': 0.001,\n", - " 'metric_strategies': [{'name': 'loss', 'value': 'min'},\n", - " {'name': 'Train-accuracy',\n", - " 'value': 'min'}],\n", - " 'objective_metric_name': 'loss',\n", - " 'type': 'minimize'},\n", - " 'parallel_trial_count': 2,\n", - " 'parameters': [{'feasible_space': {'list': None,\n", - " 'max': '0.06',\n", - " 'min': '0.01',\n", - " 'step': None},\n", - " 'name': 'lr',\n", - " 'parameter_type': 'double'},\n", - " {'feasible_space': {'list': None,\n", - " 'max': '0.9',\n", - " 'min': '0.5',\n", - " 'step': None},\n", - " 'name': 'momentum',\n", - " 'parameter_type': 'double'}],\n", - " 'resume_policy': 'Never',\n", - " 'trial_template': {'config_map': None,\n", - " 'failure_condition': 'status.conditions.#(type==\"Failed\")#|#(status==\"True\")#',\n", - " 'primary_container_name': 'training-container',\n", - " 'primary_pod_labels': None,\n", - " 'retain': None,\n", - " 'success_condition': 'status.conditions.#(type==\"Complete\")#|#(status==\"True\")#',\n", - " 'trial_parameters': [{'description': 'Learning rate for '\n", - " 'the training model',\n", - " 'name': 'learningRate',\n", - " 'reference': 'lr'},\n", - " {'description': 'Momentum for the '\n", - " 'training model',\n", - " 'name': 'momentum',\n", - " 'reference': 'momentum'}],\n", - " 'trial_spec': {'apiVersion': 'batch/v1',\n", - " 'kind': 'Job',\n", - " 'spec': {'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", - " 'spec': {'containers': [{'command': ['python3',\n", - " '/opt/pytorch-mnist/mnist.py',\n", - " '--epochs=1',\n", - " '--batch-size=64',\n", - " '--lr=${trialParameters.learningRate}',\n", - " '--momentum=${trialParameters.momentum}'],\n", - " 'image': 'docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0',\n", - " 'name': 'training-container'}],\n", - " 'restartPolicy': 'Never'}}}}}}\n", - "\n", - "Experiment Status:\n", - "{'completion_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'conditions': [{'last_transition_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is created',\n", - " 'reason': 'ExperimentCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'},\n", - " {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is running',\n", - " 'reason': 'ExperimentRunning',\n", - " 'status': 'False',\n", - " 'type': 'Running'},\n", - " {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 
55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 'Experiment has succeeded because max trial count '\n", - " 'has reached',\n", - " 'reason': 'ExperimentMaxTrialsReached',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}],\n", - " 'current_optimal_trial': {'best_trial_name': 'cmaes-example-dphxbch7',\n", - " 'observation': {'metrics': [{'latest': '0.3130',\n", - " 'max': '2.2980',\n", - " 'min': '0.2691',\n", - " 'name': 'loss'},\n", - " {'latest': 'unavailable',\n", - " 'max': 'unavailable',\n", - " 'min': 'unavailable',\n", - " 'name': 'Train-accuracy'}]},\n", - " 'parameter_assignments': [{'name': 'lr',\n", - " 'value': '0.04511033252270099'},\n", - " {'name': 'momentum',\n", - " 'value': '0.6980954001565728'}]},\n", - " 'early_stopped_trial_list': None,\n", - " 'failed_trial_list': None,\n", - " 'killed_trial_list': None,\n", - " 'last_reconcile_time': None,\n", - " 'metrics_unavailable_trial_list': None,\n", - " 'pending_trial_list': None,\n", - " 'running_trial_list': None,\n", - " 'start_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'succeeded_trial_list': ['cmaes-example-9pjzlnzc',\n", - " 'cmaes-example-dphxbch7',\n", - " 'cmaes-example-7zhq4s49'],\n", - " 'trial_metrics_unavailable': None,\n", - " 'trials': 3,\n", - " 'trials_early_stopped': None,\n", - " 'trials_failed': None,\n", - " 'trials_killed': None,\n", - " 'trials_pending': None,\n", - " 'trials_running': None,\n", - " 'trials_succeeded': 3}\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "exp = client.get_experiment(name=EXPERIMENT_NAME)\n", "print(\"Experiment:\", exp.metadata.name, end=\"\\n\\n\")\n", @@ -523,34 +373,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'last_transition_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is created',\n", - " 'reason': 'ExperimentCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'}, {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is running',\n", - " 'reason': 'ExperimentRunning',\n", - " 'status': 'False',\n", - " 'type': 'Running'}, {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 'Experiment has succeeded because max trial count has reached',\n", - " 'reason': 'ExperimentMaxTrialsReached',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}]\n" - ] - } - ], + "outputs": [], "source": [ "conditions = client.get_experiment_conditions(name=EXPERIMENT_NAME)\n", "print(conditions)" @@ -558,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "tags": [ "raises-exception" @@ -581,32 +408,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'best_trial_name': 'cmaes-example-dphxbch7',\n", - " 'observation': {'metrics': [{'latest': '0.3130',\n", - " 'max': '2.2980',\n", - " 'min': '0.2691',\n", - " 
'name': 'loss'},\n", - " {'latest': 'unavailable',\n", - " 'max': 'unavailable',\n", - " 'min': 'unavailable',\n", - " 'name': 'Train-accuracy'}]},\n", - " 'parameter_assignments': [{'name': 'lr', 'value': '0.04511033252270099'},\n", - " {'name': 'momentum', 'value': '0.6980954001565728'}]}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "client.get_optimal_hyperparameters(name=EXPERIMENT_NAME)" ] @@ -622,46 +428,12 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "scrolled": true, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trial: cmaes-example-dphxbch7\n", - "Trial Status:\n", - "{'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 25, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 25, tzinfo=tzlocal()),\n", - " 'message': 'Trial has succeeded',\n", - " 'reason': 'TrialSucceeded',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}\n", - "\n", - "Trial: cmaes-example-9pjzlnzc\n", - "Trial Status:\n", - "{'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 27, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 27, tzinfo=tzlocal()),\n", - " 'message': 'Trial has succeeded',\n", - " 'reason': 'TrialSucceeded',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}\n", - "\n", - "Trial: cmaes-example-7zhq4s49\n", - "Trial Status:\n", - "{'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 'Trial has succeeded',\n", - " 'reason': 'TrialSucceeded',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "trial_list = client.list_trials(experiment_name=EXPERIMENT_NAME)\n", "for trial in trial_list:\n", @@ -671,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "tags": [ "raises-exception" @@ -698,78 +470,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Suggestion: cmaes-example\n", - "\n", - "Suggestion Spec:\n", - "{'algorithm': {'algorithm_name': 'cmaes', 'algorithm_settings': None},\n", - " 'early_stopping': None,\n", - " 'requests': 3,\n", - " 'resume_policy': 'Never'}\n", - "\n", - "Suggestion Status:\n", - "{'algorithm_settings': None,\n", - " 'completion_time': None,\n", - " 'conditions': [{'last_transition_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'message': 'Suggestion is created',\n", - " 'reason': 'SuggestionCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'},\n", - " {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 'Suggestion is not running',\n", - " 'reason': 'Suggestion is succeeded',\n", - " 'status': 'False',\n", - " 'type': 'Running'},\n", - " {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': 
'Deployment is not ready',\n", - " 'reason': 'Suggestion is succeeded',\n", - " 'status': 'False',\n", - " 'type': 'DeploymentReady'},\n", - " {'last_transition_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 25, 14, 55, 58, tzinfo=tzlocal()),\n", - " 'message': \"Suggestion is succeeded, can't be restarted\",\n", - " 'reason': 'Experiment is succeeded',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}],\n", - " 'last_reconcile_time': None,\n", - " 'start_time': datetime.datetime(2024, 3, 25, 14, 53, 57, tzinfo=tzlocal()),\n", - " 'suggestion_count': 3,\n", - " 'suggestions': [{'early_stopping_rules': None,\n", - " 'labels': None,\n", - " 'name': 'cmaes-example-9pjzlnzc',\n", - " 'parameter_assignments': [{'name': 'lr',\n", - " 'value': '0.04188612100654'},\n", - " {'name': 'momentum',\n", - " 'value': '0.7043612817216396'}]},\n", - " {'early_stopping_rules': None,\n", - " 'labels': None,\n", - " 'name': 'cmaes-example-dphxbch7',\n", - " 'parameter_assignments': [{'name': 'lr',\n", - " 'value': '0.04511033252270099'},\n", - " {'name': 'momentum',\n", - " 'value': '0.6980954001565728'}]},\n", - " {'early_stopping_rules': None,\n", - " 'labels': None,\n", - " 'name': 'cmaes-example-7zhq4s49',\n", - " 'parameter_assignments': [{'name': 'lr',\n", - " 'value': '0.02556132716757138'},\n", - " {'name': 'momentum',\n", - " 'value': '0.701003503816815'}]}]}\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "suggestion = client.get_suggestion(name=EXPERIMENT_NAME)\n", "print(\"Suggestion:\", suggestion.metadata.name, end=\"\\n\\n\")\n", @@ -779,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "tags": [ "raises-exception" @@ -801,26 +506,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment user/cmaes-example has been deleted\n" - ] - } - ], + "outputs": [], "source": [ "client.delete_experiment(name=EXPERIMENT_NAME)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "tags": [] }, @@ -851,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { "tags": [ "raises-exception" @@ -880,7 +577,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/tests/notebooks/kfp_v2/kfp-v2-integration.ipynb b/tests/notebooks/kfp_v2/kfp-v2-integration.ipynb index 0f50966..1d770d3 100644 --- a/tests/notebooks/kfp_v2/kfp-v2-integration.ipynb +++ b/tests/notebooks/kfp_v2/kfp-v2-integration.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "28f75e55-7bad-44e7-a65f-aedc81734a48", "metadata": { "tags": [ @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "4cdd7548-bae9-4430-b548-f420d72a8aec", "metadata": { "tags": [] @@ -47,28 +47,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "fd576641-1ff4-4fbb-9b3a-122abbd281ed", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.11/site-packages/kfp/client/client.py:159: FutureWarning: This client only works with Kubeflow Pipeline v2.0.0-beta.2 and later versions.\n", - " 
warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "client = kfp.Client()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "af70bb9d-3fea-40d7-acb9-649007b0bde6", "metadata": { "tags": [] @@ -80,21 +71,60 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, + "id": "0e0bea73-b980-48b0-8c2f-0709af35798b", + "metadata": {}, + "outputs": [], + "source": [ + "HTTP_PROXY = HTTPS_PROXY = NO_PROXY = None\n", + "\n", + "if os.environ.get('HTTP_PROXY') and os.environ.get('HTTPS_PROXY') and os.environ.get('NO_PROXY'):\n", + " HTTP_PROXY = os.environ['HTTP_PROXY']\n", + " HTTPS_PROXY = os.environ['HTTPS_PROXY']\n", + " # add `.kubeflow` to NO_PROXY needed for pipelines\n", + " NO_PROXY = os.environ['NO_PROXY']+\",.kubeflow\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb28ac8a-4155-46ab-88a4-dc3f58c24003", + "metadata": {}, + "outputs": [], + "source": [ + "def add_proxy(obj, http_proxy=HTTP_PROXY, https_proxy=HTTPS_PROXY, no_proxy=NO_PROXY):\n", + " \"\"\"Adds the proxy env vars to the PipelineTask object.\"\"\"\n", + " return (\n", + " obj.set_env_variable(name='http_proxy', value=http_proxy)\n", + " .set_env_variable(name='https_proxy', value=https_proxy)\n", + " .set_env_variable(name='HTTP_PROXY', value=http_proxy)\n", + " .set_env_variable(name='HTTPS_PROXY', value=https_proxy)\n", + " .set_env_variable(name='no_proxy', value=no_proxy)\n", + " .set_env_variable(name='NO_PROXY', value=no_proxy)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd80a4ab-a444-42e0-94ae-ac2d5bd9d315", + "metadata": {}, + "outputs": [], + "source": [ + "def proxy_envs_set():\n", + " if HTTP_PROXY and HTTPS_PROXY and NO_PROXY:\n", + " return True\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "40a3a9e1-0645-474e-8451-92ccba88a122", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.11/site-packages/kfp/dsl/component_decorator.py:119: FutureWarning: Python 3.7 has reached end-of-life. The default base_image used by the @dsl.component decorator will switch from 'python:3.7' to 'python:3.8' on April 23, 2024. To ensure your existing components work with versions of the KFP SDK released after that date, you should provide an explicit base_image argument and ensure your component works as intended on Python 3.8.\n", - " return component_factory.create_component_from_func(\n" - ] - } - ], + "outputs": [], "source": [ "@dsl.component()\n", "def flip_coin(force_flip_result: str = '') -> str:\n", @@ -108,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "1d134c8b-54a7-4d10-ae2f-321ff305600a", "metadata": { "tags": [] @@ -123,21 +153,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "c8132d87-877c-4bfb-9127-e1f964fe3acb", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_356/2573238994.py:6: DeprecationWarning: dsl.Condition is deprecated. 
Please use dsl.If instead.\n", - " with dsl.Condition(flip1.output == 'heads'):\n" - ] - } - ], + "outputs": [], "source": [ "@dsl.pipeline(name='condition-v2')\n", "def condition_pipeline(text: str = 'condition test', force_flip_result: str = ''):\n", @@ -152,91 +173,63 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, + "id": "c04029ac-f284-4a13-a39c-6af783ec2b10", + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(name='condition-v2')\n", + "def condition_pipeline_proxy(text: str = 'condition test', force_flip_result: str = ''):\n", + " flip1 = add_proxy(flip_coin(force_flip_result=force_flip_result))\n", + " add_proxy(print_msg(msg=flip1.output))\n", + "\n", + " with dsl.Condition(flip1.output == 'heads'):\n", + " flip2 = add_proxy(flip_coin())\n", + " add_proxy(print_msg(msg=flip2.output))\n", + " add_proxy(print_msg(msg=text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "b85cc961-b6cc-4434-a59d-31e4c8a6e175", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "Experiment details." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Run details." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "run = client.create_run_from_pipeline_func(\n", - " condition_pipeline,\n", - " experiment_name=EXPERIMENT_NAME,\n", - ")" + "if proxy_envs_set():\n", + " run = client.create_run_from_pipeline_func(\n", + " condition_pipeline_proxy,\n", + " experiment_name=EXPERIMENT_NAME,\n", + " )\n", + "else:\n", + " run = client.create_run_from_pipeline_func(\n", + " condition_pipeline,\n", + " experiment_name=EXPERIMENT_NAME,\n", + " )" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "37ebdc86-a16d-40a0-bc7e-33a2b90914f8", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'created_at': datetime.datetime(2023, 11, 21, 10, 35, tzinfo=tzlocal()),\n", - " 'description': None,\n", - " 'display_name': 'Flip a coin and output tails/heads pipeline',\n", - " 'experiment_id': '721a46c5-c6c9-4d28-af04-00a8503673ac',\n", - " 'namespace': 'daniela',\n", - " 'storage_state': 'AVAILABLE'}]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "client.list_experiments().experiments" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "3226c13b-9d08-47e7-812f-47529c02d9dc", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "'SUCCEEDED'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "client.get_run(run.run_id).state" ] @@ -277,6 +270,14 @@ "\n", "assert_run_succeeded(client, run.run_id)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eefaf73-53ab-4136-94c9-6b8e5006864a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -295,7 +296,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/tests/notebooks/training/training-integration.ipynb b/tests/notebooks/training/training-integration.ipynb index 32e0927..c2e65e6 100644 --- a/tests/notebooks/training/training-integration.ipynb +++ b/tests/notebooks/training/training-integration.ipynb @@ -1,897 +1,918 @@
{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test Training Operator Integration\n", - "\n", - "This example notebook is loosely based on the following upstream examples:\n", - "* [TFJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/tensorflow/image-classification/create-tfjob.ipynb)\n", - "* [PyTorchJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/pytorch/image-classification/create-pytorchjob.ipynb)\n", - "* [PaddleJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/paddlepaddle/simple-cpu.yaml)\n", - "\n", - "Note that the above can get out of sync with the actual testing upstream does, so make sure to also check out [upstream E2E tests](https://github.com/kubeflow/training-operator/tree/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/sdk/python/test/e2e) for updating the notebook.\n", - "\n", - "The workflow for each job (TFJob, PyTorchJob, and PaddleJob) is:\n", - "- create training job\n", - "- monitor its execution\n", - "- get training logs\n", - "- delete job" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "pytest-skip" - ] - }, - "outputs": [], - "source": [ - "# Please check the requirements.in file for more details\n", - "!pip install -r requirements.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import required packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from kubeflow.training import (\n", - " KubeflowOrgV1PaddleJob,\n", - " KubeflowOrgV1PaddleJobSpec,\n", - " KubeflowOrgV1PyTorchJob,\n", - " KubeflowOrgV1PyTorchJobSpec,\n", - " KubeflowOrgV1TFJob,\n", - " KubeflowOrgV1TFJobSpec,\n", - " TrainingClient,\n", - " V1ReplicaSpec,\n", - " V1RunPolicy,\n", - ")\n", - "from kubernetes.client import (\n", - " V1Container,\n", - " V1ContainerPort,\n", - " V1ObjectMeta,\n", - " V1PodSpec,\n", - " V1PodTemplateSpec,\n", - ")\n", - "from tenacity import retry, stop_after_attempt, wait_exponential" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialise Training Client\n", - "\n", - "We will be using the Training SDK for any actions executed as part of this example." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = TrainingClient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Helper to print training logs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def print_training_logs(client, job_name: str, container: str, is_master: bool = True):\n", - " logs = client.get_job_logs(name=job_name, container=container, is_master=is_master)\n", - " print(logs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Helper to check that Job succeeded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=30),\n", - " stop=stop_after_attempt(50),\n", - " reraise=True,\n", - ")\n", - "def assert_job_succeeded(client, job_name, job_kind):\n", - " \"\"\"Wait for the Job to complete successfully.\"\"\"\n", - " assert client.is_job_succeeded(\n", - " name=job_name, job_kind=job_kind\n", - " ), f\"Job {job_name} was not successful.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test TFJob" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a TFJob\n", - "\n", - "Define a TFJob object before deploying it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TFJOB_NAME = \"mnist\"\n", - "TFJOB_CONTAINER = \"tensorflow\"\n", - "TFJOB_IMAGE = \"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "container = V1Container(\n", - " name=TFJOB_CONTAINER,\n", - " image=TFJOB_IMAGE,\n", - " command=[\n", - " \"python\",\n", - " \"/var/tf_mnist/mnist_with_summaries.py\",\n", - " \"--log_dir=/train/logs\",\n", - " \"--learning_rate=0.01\",\n", - " \"--batch_size=150\",\n", - " ],\n", - ")\n", - "\n", - "worker = V1ReplicaSpec(\n", - " replicas=2,\n", - " restart_policy=\"Never\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "chief = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"Never\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "ps = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"Never\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "tfjob = KubeflowOrgV1TFJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"TFJob\",\n", - " metadata=V1ObjectMeta(name=TFJOB_NAME),\n", - " spec=KubeflowOrgV1TFJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " tf_replica_specs={\"Worker\": worker, \"Chief\": chief, \"PS\": ps},\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Print the Job's info to verify it before submission." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Name:\", tfjob.metadata.name)\n", - "print(\"Spec:\", tfjob.spec.tf_replica_specs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing TFJobs\n", - "\n", - "List TFJobs in the current namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[job.metadata.name for job in client.list_tfjobs()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create TFJob\n", - "\n", - "Create a TFJob using the SDK." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.create_tfjob(tfjob)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Get TFJob\n", - "Get the created TFJob by name and check its data. \n", - "Make sure that it completes successfully before proceeding. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# verify that the Job was created successfully\n", - "# raises an error if it doesn't exist\n", - "tfjob = client.get_tfjob(name=TFJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for the Job to complete successfully\n", - "assert_job_succeeded(client, TFJOB_NAME, job_kind=\"TFJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Job:\", tfjob.metadata.name, end=\"\\n\\n\")\n", - "print(\"Job Spec:\", tfjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", - "print(\"Job Status:\", tfjob.status, sep=\"\\n\", end=\"\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get TFJob Training logs\n", - "Get and print the training logs of the TFJob with the training steps " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print_training_logs(client, TFJOB_NAME, container=TFJOB_CONTAINER)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete TFJob\n", - "\n", - "Delete the created TFJob." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.delete_tfjob(name=TFJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=10),\n", - " stop=stop_after_attempt(30),\n", - " reraise=True,\n", - ")\n", - "def assert_tfjob_removed(client, job_name):\n", - " \"\"\"Wait for TFJob to be removed.\"\"\"\n", - " # fetch the existing TFJob names\n", - " # verify that the Job was deleted successfully\n", - " jobs = {job.metadata.name for job in client.list_tfjobs()}\n", - " assert job_name not in jobs, f\"Failed to delete TFJob {job_name}!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for TFJob resources to be removed successfully\n", - "assert_tfjob_removed(client, TFJOB_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test PyTorchJob" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a PyTorchJob\n", - "Define a PyTorchJob object before deploying it. This PyTorchJob is similar to [this](https://github.com/kubeflow/training-operator/blob/11b7a115e6538caeab405344af98f0d5b42a4c96/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb) example." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "PYTORCHJOB_NAME = \"pytorch-mnist-gloo\"\n", - "PYTORCHJOB_CONTAINER = \"pytorch\"\n", - "PYTORCHJOB_IMAGE = \"kubeflowkatib/pytorch-mnist-cpu:v0.16.0\"\n", - "# The image above should be updated with each release with the corresponding Katib version used in CKF release.\n", - "# Note that instead of using the [image from training-operator repository](https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/mnist/Dockerfile),\n", - "# the one [from Katib](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu) is being used\n", - "# due to the large size of the first one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "container = V1Container(\n", - " name=PYTORCHJOB_CONTAINER,\n", - " image=PYTORCHJOB_IMAGE,\n", - " args=[\"--backend\", \"gloo\", \"--epochs\", \"2\"],\n", - " # Passing `epochs`argument since kubeflowkatib image defaults to 10.\n", - ")\n", - "\n", - "replica_spec = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"OnFailure\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "pytorchjob = KubeflowOrgV1PyTorchJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"PyTorchJob\",\n", - " metadata=V1ObjectMeta(name=PYTORCHJOB_NAME),\n", - " spec=KubeflowOrgV1PyTorchJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " pytorch_replica_specs={\"Master\": replica_spec, \"Worker\": replica_spec},\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Print the Job's info to verify it before submission." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Name:\", pytorchjob.metadata.name)\n", - "print(\"Spec:\", pytorchjob.spec.pytorch_replica_specs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing PyTorchJobs\n", - "\n", - "List PyTorchJobs in the current namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[job.metadata.name for job in client.list_pytorchjobs()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create PyTorchJob\n", - "\n", - "Create a PyTorchJob using the SDK." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.create_pytorchjob(pytorchjob)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Get PyTorchJob\n", - "Get the created PyTorchJob by name and check its data. \n", - "Make sure that it completes successfully before proceeding. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# verify that the Job was created successfully\n", - "# raises an error if it doesn't exist\n", - "pytorchjob = client.get_pytorchjob(name=PYTORCHJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for the Job to complete successfully\n", - "assert_job_succeeded(client, PYTORCHJOB_NAME, job_kind=\"PyTorchJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Job:\", pytorchjob.metadata.name, end=\"\\n\\n\")\n", - "print(\"Job Spec:\", pytorchjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", - "print(\"Job Status:\", pytorchjob.status, sep=\"\\n\", end=\"\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get PyTorchJob Training logs\n", - "Get and print the training logs of the PyTorchJob with the training steps " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print_training_logs(client, PYTORCHJOB_NAME, container=PYTORCHJOB_CONTAINER)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete PyTorchJob\n", - "\n", - "Delete the created PyTorchJob." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.delete_pytorchjob(name=PYTORCHJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=10),\n", - " stop=stop_after_attempt(30),\n", - " reraise=True,\n", - ")\n", - "def assert_pytorchjob_removed(client, job_name):\n", - " \"\"\"Wait for PyTorchJob to be removed.\"\"\"\n", - " # fetch the existing PyTorchJob names\n", - " # verify that the Job was deleted successfully\n", - " jobs = {job.metadata.name for job in client.list_pytorchjobs()}\n", - " assert job_name not in jobs, f\"Failed to delete PyTorchJob {job_name}!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for PyTorch job to be removed successfully\n", - "assert_pytorchjob_removed(client, PYTORCHJOB_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test PaddlePaddle" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a PaddleJob\n", - "\n", - "Define a PaddleJob object before deploying it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "PADDLEJOB_NAME = \"paddle-simple-cpu\"\n", - "PADDLEJOB_CONTAINER = \"paddle\"\n", - "PADDLEJOB_IMAGE = \"docker.io/paddlepaddle/paddle:2.4.0rc0-cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "port = V1ContainerPort(container_port=37777, name=\"master\")\n", - "\n", - "container = V1Container(\n", - " name=PADDLEJOB_CONTAINER,\n", - " image=PADDLEJOB_IMAGE,\n", - " command=[\"python\"],\n", - " args=[\"-m\", \"paddle.distributed.launch\", \"run_check\"],\n", - " ports=[port],\n", - ")\n", - "\n", - "replica_spec = V1ReplicaSpec(\n", - " replicas=2,\n", - " restart_policy=\"OnFailure\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "paddlejob = KubeflowOrgV1PaddleJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"PaddleJob\",\n", - " metadata=V1ObjectMeta(name=PADDLEJOB_NAME),\n", - " spec=KubeflowOrgV1PaddleJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " paddle_replica_specs={\"Worker\": replica_spec},\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Print the Job's info to verify it before submission." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Name:\", paddlejob.metadata.name)\n", - "print(\"Spec:\", paddlejob.spec.paddle_replica_specs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing PaddleJobs\n", - "\n", - "List PaddleJobs in the current namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[job.metadata.name for job in client.list_paddlejobs()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create PaddleJob\n", - "\n", - "Create a PaddleJob using the SDK." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.create_paddlejob(paddlejob)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Get PaddleJob\n", - "Get the created PaddleJob by name and check its data. \n", - "Make sure that it completes successfully before proceeding. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# verify that the Job was created successfully\n", - "# raises an error if it doesn't exist\n", - "paddlejob = client.get_paddlejob(name=PADDLEJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for the Job to complete successfully\n", - "assert_job_succeeded(client, PADDLEJOB_NAME, job_kind=\"PaddleJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Job:\", paddlejob.metadata.name, end=\"\\n\\n\")\n", - "print(\"Job Spec:\", paddlejob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", - "print(\"Job Status:\", paddlejob.status, sep=\"\\n\", end=\"\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get PaddleJob Training logs\n", - "Get and print the training logs of the PaddleJob with the training steps " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# set is_master to False because this example does not include a master replica type\n", - "print_training_logs(client, PADDLEJOB_NAME, container=PADDLEJOB_CONTAINER, is_master=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete PaddleJob\n", - "\n", - "Delete the created PaddleJob." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.delete_paddlejob(name=PADDLEJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=10),\n", - " stop=stop_after_attempt(30),\n", - " reraise=True,\n", - ")\n", - "def assert_paddlejob_removed(client, job_name):\n", - " \"\"\"Wait for PaddleJob to be removed.\"\"\"\n", - " # fetch the existing PaddleJob names\n", - " # verify that the Job was deleted successfully\n", - " jobs = {job.metadata.name for job in client.list_paddlejobs()}\n", - " assert job_name not in jobs, f\"Failed to delete PaddleJob {job_name}!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for PaddleJob to be removed successfully\n", - "assert_paddlejob_removed(client, PADDLEJOB_NAME)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Training Operator Integration\n", + "\n", + "This example notebook is loosely based on the following upstream examples:\n", + "* [TFJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/tensorflow/image-classification/create-tfjob.ipynb)\n", + "* [PyTorchJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/pytorch/image-classification/create-pytorchjob.ipynb)\n", + "* [PaddleJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/paddlepaddle/simple-cpu.yaml)\n", + "\n", + "Note that the above can get out of sync with the actual testing upstream does, so make sure to also check out [upstream E2E tests](https://github.com/kubeflow/training-operator/tree/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/sdk/python/test/e2e) for updating the notebook.\n", + "\n", + "The workflow for each job (TFJob, PyTorchJob, and PaddleJob) is:\n", + "- create training job\n", + "- monitor its execution\n", + "- get training logs\n", + "- delete job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "pytest-skip" + ] + }, + "outputs": [], + "source": [ + "# Please check the requirements.in file for more details\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from kubeflow.training import (\n", + " KubeflowOrgV1PaddleJob,\n", + " KubeflowOrgV1PaddleJobSpec,\n", + " KubeflowOrgV1PyTorchJob,\n", + " KubeflowOrgV1PyTorchJobSpec,\n", + " KubeflowOrgV1TFJob,\n", + " KubeflowOrgV1TFJobSpec,\n", + " TrainingClient,\n", + " V1ReplicaSpec,\n", + " V1RunPolicy,\n", + ")\n", + "from kubernetes.client import 
(\n", + " V1Container,\n", + " V1ContainerPort,\n", + " V1ObjectMeta,\n", + " V1PodSpec,\n", + " V1PodTemplateSpec,\n", + ")\n", + "from tenacity import retry, stop_after_attempt, wait_exponential\n", + "\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialise Training Client\n", + "\n", + "We will be using the Training SDK for any actions executed as part of this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = TrainingClient()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Helper to print training logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_training_logs(client, job_name: str, container: str, is_master: bool = True):\n", + " logs = client.get_job_logs(name=job_name, container=container, is_master=is_master)\n", + " print(logs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Helper to check that Job succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=30),\n", + " stop=stop_after_attempt(50),\n", + " reraise=True,\n", + ")\n", + "def assert_job_succeeded(client, job_name, job_kind):\n", + " \"\"\"Wait for the Job to complete successfully.\"\"\"\n", + " assert client.is_job_succeeded(\n", + " name=job_name, job_kind=job_kind\n", + " ), f\"Job {job_name} was not successful.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Helper to get the spec labels\n", + "This will add the label for the proxy PodDefault if the proxy envs are set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_labels = {}\n", + "if os.environ.get('HTTP_PROXY') and os.environ.get('HTTPS_PROXY') and os.environ.get('NO_PROXY'):\n", + " training_labels = {\"notebook-proxy\": \"true\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test TFJob" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a TFJob\n", + "\n", + "Define a TFJob object before deploying it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TFJOB_NAME = \"mnist\"\n", + "TFJOB_CONTAINER = \"tensorflow\"\n", + "TFJOB_IMAGE = \"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = V1Container(\n", + " name=TFJOB_CONTAINER,\n", + " image=TFJOB_IMAGE,\n", + " command=[\n", + " \"python\",\n", + " \"/var/tf_mnist/mnist_with_summaries.py\",\n", + " \"--log_dir=/train/logs\",\n", + " \"--learning_rate=0.01\",\n", + " \"--batch_size=150\",\n", + " ],\n", + ")\n", + "\n", + "worker = V1ReplicaSpec(\n", + " replicas=2,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "chief = V1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "ps = V1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "tfjob = KubeflowOrgV1TFJob(\n", + " api_version=\"kubeflow.org/v1\",\n", + " kind=\"TFJob\",\n", + " metadata=V1ObjectMeta(name=TFJOB_NAME),\n", + " spec=KubeflowOrgV1TFJobSpec(\n", + " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " tf_replica_specs={\"Worker\": worker, \"Chief\": chief, \"PS\": ps},\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the Job's info to verify it before submission." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Name:\", tfjob.metadata.name)\n", + "print(\"Spec:\", tfjob.spec.tf_replica_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List existing TFJobs\n", + "\n", + "List TFJobs in the current namespace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[job.metadata.name for job in client.list_tfjobs()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create TFJob\n", + "\n", + "Create a TFJob using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.create_tfjob(tfjob)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Get TFJob\n", + "Get the created TFJob by name and check its data. \n", + "Make sure that it completes successfully before proceeding. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# verify that the Job was created successfully\n", + "# raises an error if it doesn't exist\n", + "tfjob = client.get_tfjob(name=TFJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for the Job to complete successfully\n", + "assert_job_succeeded(client, TFJOB_NAME, job_kind=\"TFJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Job:\", tfjob.metadata.name, end=\"\\n\\n\")\n", + "print(\"Job Spec:\", tfjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", + "print(\"Job Status:\", tfjob.status, sep=\"\\n\", end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get TFJob Training logs\n", + "Get and print the training logs of the TFJob with the training steps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_training_logs(client, TFJOB_NAME, container=TFJOB_CONTAINER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete TFJob\n", + "\n", + "Delete the created TFJob." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_tfjob(name=TFJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=10),\n", + " stop=stop_after_attempt(30),\n", + " reraise=True,\n", + ")\n", + "def assert_tfjob_removed(client, job_name):\n", + " \"\"\"Wait for TFJob to be removed.\"\"\"\n", + " # fetch the existing TFJob names\n", + " # verify that the Job was deleted successfully\n", + " jobs = {job.metadata.name for job in client.list_tfjobs()}\n", + " assert job_name not in jobs, f\"Failed to delete TFJob {job_name}!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for TFJob resources to be removed successfully\n", + "assert_tfjob_removed(client, TFJOB_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test PyTorchJob" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a PyTorchJob\n", + "Define a PyTorchJob object before deploying it. This PyTorchJob is similar to [this](https://github.com/kubeflow/training-operator/blob/11b7a115e6538caeab405344af98f0d5b42a4c96/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb) example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PYTORCHJOB_NAME = \"pytorch-mnist-gloo\"\n", + "PYTORCHJOB_CONTAINER = \"pytorch\"\n", + "PYTORCHJOB_IMAGE = \"kubeflowkatib/pytorch-mnist-cpu:v0.16.0\"\n", + "# The image above should be updated with each release with the corresponding Katib version used in CKF release.\n", + "# Note that instead of using the [image from training-operator repository](https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/mnist/Dockerfile),\n", + "# the one [from Katib](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu) is being used\n", + "# due to the large size of the first one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = V1Container(\n", + " name=PYTORCHJOB_CONTAINER,\n", + " image=PYTORCHJOB_IMAGE,\n", + " args=[\"--backend\", \"gloo\", \"--epochs\", \"2\"],\n", + " # Passing `epochs`argument since kubeflowkatib image defaults to 10.\n", + ")\n", + "\n", + "replica_spec = V1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"OnFailure\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "pytorchjob = KubeflowOrgV1PyTorchJob(\n", + " api_version=\"kubeflow.org/v1\",\n", + " kind=\"PyTorchJob\",\n", + " metadata=V1ObjectMeta(name=PYTORCHJOB_NAME),\n", + " spec=KubeflowOrgV1PyTorchJobSpec(\n", + " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " pytorch_replica_specs={\"Master\": replica_spec, \"Worker\": replica_spec},\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the Job's info to verify it before submission." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Name:\", pytorchjob.metadata.name)\n", + "print(\"Spec:\", pytorchjob.spec.pytorch_replica_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List existing PyTorchJobs\n", + "\n", + "List PyTorchJobs in the current namespace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[job.metadata.name for job in client.list_pytorchjobs()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create PyTorchJob\n", + "\n", + "Create a PyTorchJob using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.create_pytorchjob(pytorchjob)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Get PyTorchJob\n", + "Get the created PyTorchJob by name and check its data. \n", + "Make sure that it completes successfully before proceeding. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# verify that the Job was created successfully\n", + "# raises an error if it doesn't exist\n", + "pytorchjob = client.get_pytorchjob(name=PYTORCHJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for the Job to complete successfully\n", + "assert_job_succeeded(client, PYTORCHJOB_NAME, job_kind=\"PyTorchJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Job:\", pytorchjob.metadata.name, end=\"\\n\\n\")\n", + "print(\"Job Spec:\", pytorchjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", + "print(\"Job Status:\", pytorchjob.status, sep=\"\\n\", end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get PyTorchJob Training logs\n", + "Get and print the training logs of the PyTorchJob with the training steps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_training_logs(client, PYTORCHJOB_NAME, container=PYTORCHJOB_CONTAINER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete PyTorchJob\n", + "\n", + "Delete the created PyTorchJob." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_pytorchjob(name=PYTORCHJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=10),\n", + " stop=stop_after_attempt(30),\n", + " reraise=True,\n", + ")\n", + "def assert_pytorchjob_removed(client, job_name):\n", + " \"\"\"Wait for PyTorchJob to be removed.\"\"\"\n", + " # fetch the existing PyTorchJob names\n", + " # verify that the Job was deleted successfully\n", + " jobs = {job.metadata.name for job in client.list_pytorchjobs()}\n", + " assert job_name not in jobs, f\"Failed to delete PyTorchJob {job_name}!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for PyTorch job to be removed successfully\n", + "assert_pytorchjob_removed(client, PYTORCHJOB_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test PaddlePaddle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a PaddleJob\n", + "\n", + "Define a PaddleJob object before deploying it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PADDLEJOB_NAME = \"paddle-simple-cpu\"\n", + "PADDLEJOB_CONTAINER = \"paddle\"\n", + "PADDLEJOB_IMAGE = \"docker.io/paddlepaddle/paddle:2.4.0rc0-cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "port = V1ContainerPort(container_port=37777, name=\"master\")\n", + "\n", + "container = V1Container(\n", + " name=PADDLEJOB_CONTAINER,\n", + " image=PADDLEJOB_IMAGE,\n", + " command=[\"python\"],\n", + " args=[\"-m\", \"paddle.distributed.launch\", \"run_check\"],\n", + " ports=[port],\n", + ")\n", + "\n", + "replica_spec = V1ReplicaSpec(\n", + " replicas=2,\n", + " restart_policy=\"OnFailure\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "paddlejob = KubeflowOrgV1PaddleJob(\n", + " api_version=\"kubeflow.org/v1\",\n", + " kind=\"PaddleJob\",\n", + " metadata=V1ObjectMeta(name=PADDLEJOB_NAME, labels=training_labels),\n", + " spec=KubeflowOrgV1PaddleJobSpec(\n", + " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " paddle_replica_specs={\"Worker\": replica_spec},\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the Job's info to verify it before submission." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Name:\", paddlejob.metadata.name)\n", + "print(\"Spec:\", paddlejob.spec.paddle_replica_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List existing PaddleJobs\n", + "\n", + "List PaddleJobs in the current namespace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[job.metadata.name for job in client.list_paddlejobs()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create PaddleJob\n", + "\n", + "Create a PaddleJob using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.create_paddlejob(paddlejob)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Get PaddleJob\n", + "Get the created PaddleJob by name and check its data. \n", + "Make sure that it completes successfully before proceeding. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# verify that the Job was created successfully\n", + "# raises an error if it doesn't exist\n", + "paddlejob = client.get_paddlejob(name=PADDLEJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for the Job to complete successfully\n", + "assert_job_succeeded(client, PADDLEJOB_NAME, job_kind=\"PaddleJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Job:\", paddlejob.metadata.name, end=\"\\n\\n\")\n", + "print(\"Job Spec:\", paddlejob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", + "print(\"Job Status:\", paddlejob.status, sep=\"\\n\", end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get PaddleJob Training logs\n", + "Get and print the training logs of the PaddleJob with the training steps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set is_master to False because this example does not include a master replica type\n", + "print_training_logs(client, PADDLEJOB_NAME, container=PADDLEJOB_CONTAINER, is_master=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete PaddleJob\n", + "\n", + "Delete the created PaddleJob." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_paddlejob(name=PADDLEJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=10),\n", + " stop=stop_after_attempt(30),\n", + " reraise=True,\n", + ")\n", + "def assert_paddlejob_removed(client, job_name):\n", + " \"\"\"Wait for PaddleJob to be removed.\"\"\"\n", + " # fetch the existing PaddleJob names\n", + " # verify that the Job was deleted successfully\n", + " jobs = {job.metadata.name for job in client.list_paddlejobs()}\n", + " assert job_name not in jobs, f\"Failed to delete PaddleJob {job_name}!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for PaddleJob to be removed successfully\n", + "assert_paddlejob_removed(client, PADDLEJOB_NAME)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 + } diff --git a/tests/proxy-poddefault.yaml b/tests/proxy-poddefault.yaml new file mode 100644 index 0000000..a1f7300 --- /dev/null +++ b/tests/proxy-poddefault.yaml @@ -0,0 +1,22 @@ +apiVersion: kubeflow.org/v1alpha1 +kind: PodDefault +metadata: + name: notebook-proxy +spec: + desc: Add proxy settings + env: + - name: HTTP_PROXY + value: : + - name: http_proxy + value: : + - name: HTTPS_PROXY + value: : + - name: https_proxy + value: : + - name: NO_PROXY + value: ,,127.0.0.1,/24,,.svc,.local + - name: no_proxy + 
value: ,,127.0.0.1,/24,,.svc,.local + selector: + matchLabels: + notebook-proxy: "true" From 9861a407d213a5d70ec1c20ed78662b77a6ba3f1 Mon Sep 17 00:00:00 2001 From: Michal Hucko Date: Fri, 23 Aug 2024 12:11:35 +0200 Subject: [PATCH 2/5] Prefetch mnist dataset in tfjob to solve proxy problems (#105) * Prefetch mnist dataset in tfjob to solve proxy problems --- .../training/training-integration.ipynb | 1839 +++++++++-------- 1 file changed, 922 insertions(+), 917 deletions(-) diff --git a/tests/notebooks/training/training-integration.ipynb b/tests/notebooks/training/training-integration.ipynb index c2e65e6..9cc0b56 100644 --- a/tests/notebooks/training/training-integration.ipynb +++ b/tests/notebooks/training/training-integration.ipynb @@ -1,918 +1,923 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test Training Operator Integration\n", - "\n", - "This example notebook is loosely based on the following upstream examples:\n", - "* [TFJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/tensorflow/image-classification/create-tfjob.ipynb)\n", - "* [PyTorchJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/pytorch/image-classification/create-pytorchjob.ipynb)\n", - "* [PaddleJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/paddlepaddle/simple-cpu.yaml)\n", - "\n", - "Note that the above can get out of sync with the actual testing upstream does, so make sure to also check out [upstream E2E tests](https://github.com/kubeflow/training-operator/tree/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/sdk/python/test/e2e) for updating the notebook.\n", - "\n", - "The workflow for each job (TFJob, PyTorchJob, and PaddleJob) is:\n", - "- create training job\n", - "- monitor its execution\n", - "- get training logs\n", - "- delete job" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "pytest-skip" - ] - }, - "outputs": [], - "source": [ - "# Please check the requirements.in file for more details\n", - "!pip install -r requirements.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import required packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from kubeflow.training import (\n", - " KubeflowOrgV1PaddleJob,\n", - " KubeflowOrgV1PaddleJobSpec,\n", - " KubeflowOrgV1PyTorchJob,\n", - " KubeflowOrgV1PyTorchJobSpec,\n", - " KubeflowOrgV1TFJob,\n", - " KubeflowOrgV1TFJobSpec,\n", - " TrainingClient,\n", - " V1ReplicaSpec,\n", - " V1RunPolicy,\n", - ")\n", - "from kubernetes.client import (\n", - " V1Container,\n", - " V1ContainerPort,\n", - " V1ObjectMeta,\n", - " V1PodSpec,\n", - " V1PodTemplateSpec,\n", - ")\n", - "from tenacity import retry, stop_after_attempt, wait_exponential\n", - "\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialise Training Client\n", - "\n", - "We will be using the Training SDK for any actions executed as part of this example." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = TrainingClient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Helper to print training logs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def print_training_logs(client, job_name: str, container: str, is_master: bool = True):\n", - " logs = client.get_job_logs(name=job_name, container=container, is_master=is_master)\n", - " print(logs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Helper to check that Job succeeded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=30),\n", - " stop=stop_after_attempt(50),\n", - " reraise=True,\n", - ")\n", - "def assert_job_succeeded(client, job_name, job_kind):\n", - " \"\"\"Wait for the Job to complete successfully.\"\"\"\n", - " assert client.is_job_succeeded(\n", - " name=job_name, job_kind=job_kind\n", - " ), f\"Job {job_name} was not successful.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Helper to get the spec labels\n", - "This will add the label for the proxy PodDefault if the proxy envs are set" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_labels = {}\n", - "if os.environ.get('HTTP_PROXY') and os.environ.get('HTTPS_PROXY') and os.environ.get('NO_PROXY'):\n", - " training_labels = {\"notebook-proxy\": \"true\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test TFJob" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a TFJob\n", - "\n", - "Define a TFJob object before deploying it." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TFJOB_NAME = \"mnist\"\n", - "TFJOB_CONTAINER = \"tensorflow\"\n", - "TFJOB_IMAGE = \"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "container = V1Container(\n", - " name=TFJOB_CONTAINER,\n", - " image=TFJOB_IMAGE,\n", - " command=[\n", - " \"python\",\n", - " \"/var/tf_mnist/mnist_with_summaries.py\",\n", - " \"--log_dir=/train/logs\",\n", - " \"--learning_rate=0.01\",\n", - " \"--batch_size=150\",\n", - " ],\n", - ")\n", - "\n", - "worker = V1ReplicaSpec(\n", - " replicas=2,\n", - " restart_policy=\"Never\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "chief = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"Never\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "ps = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"Never\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "tfjob = KubeflowOrgV1TFJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"TFJob\",\n", - " metadata=V1ObjectMeta(name=TFJOB_NAME),\n", - " spec=KubeflowOrgV1TFJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " tf_replica_specs={\"Worker\": worker, \"Chief\": chief, \"PS\": ps},\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Print the Job's info to verify it before submission." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Name:\", tfjob.metadata.name)\n", - "print(\"Spec:\", tfjob.spec.tf_replica_specs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing TFJobs\n", - "\n", - "List TFJobs in the current namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[job.metadata.name for job in client.list_tfjobs()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create TFJob\n", - "\n", - "Create a TFJob using the SDK." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.create_tfjob(tfjob)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Get TFJob\n", - "Get the created TFJob by name and check its data. \n", - "Make sure that it completes successfully before proceeding. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# verify that the Job was created successfully\n", - "# raises an error if it doesn't exist\n", - "tfjob = client.get_tfjob(name=TFJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for the Job to complete successfully\n", - "assert_job_succeeded(client, TFJOB_NAME, job_kind=\"TFJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Job:\", tfjob.metadata.name, end=\"\\n\\n\")\n", - "print(\"Job Spec:\", tfjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", - "print(\"Job Status:\", tfjob.status, sep=\"\\n\", end=\"\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get TFJob Training logs\n", - "Get and print the training logs of the TFJob with the training steps " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print_training_logs(client, TFJOB_NAME, container=TFJOB_CONTAINER)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete TFJob\n", - "\n", - "Delete the created TFJob." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.delete_tfjob(name=TFJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=10),\n", - " stop=stop_after_attempt(30),\n", - " reraise=True,\n", - ")\n", - "def assert_tfjob_removed(client, job_name):\n", - " \"\"\"Wait for TFJob to be removed.\"\"\"\n", - " # fetch the existing TFJob names\n", - " # verify that the Job was deleted successfully\n", - " jobs = {job.metadata.name for job in client.list_tfjobs()}\n", - " assert job_name not in jobs, f\"Failed to delete TFJob {job_name}!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for TFJob resources to be removed successfully\n", - "assert_tfjob_removed(client, TFJOB_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test PyTorchJob" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a PyTorchJob\n", - "Define a PyTorchJob object before deploying it. This PyTorchJob is similar to [this](https://github.com/kubeflow/training-operator/blob/11b7a115e6538caeab405344af98f0d5b42a4c96/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb) example." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "PYTORCHJOB_NAME = \"pytorch-mnist-gloo\"\n", - "PYTORCHJOB_CONTAINER = \"pytorch\"\n", - "PYTORCHJOB_IMAGE = \"kubeflowkatib/pytorch-mnist-cpu:v0.16.0\"\n", - "# The image above should be updated with each release with the corresponding Katib version used in CKF release.\n", - "# Note that instead of using the [image from training-operator repository](https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/mnist/Dockerfile),\n", - "# the one [from Katib](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu) is being used\n", - "# due to the large size of the first one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "container = V1Container(\n", - " name=PYTORCHJOB_CONTAINER,\n", - " image=PYTORCHJOB_IMAGE,\n", - " args=[\"--backend\", \"gloo\", \"--epochs\", \"2\"],\n", - " # Passing `epochs`argument since kubeflowkatib image defaults to 10.\n", - ")\n", - "\n", - "replica_spec = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"OnFailure\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "pytorchjob = KubeflowOrgV1PyTorchJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"PyTorchJob\",\n", - " metadata=V1ObjectMeta(name=PYTORCHJOB_NAME),\n", - " spec=KubeflowOrgV1PyTorchJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " pytorch_replica_specs={\"Master\": replica_spec, \"Worker\": replica_spec},\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Print the Job's info to verify it before submission." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Name:\", pytorchjob.metadata.name)\n", - "print(\"Spec:\", pytorchjob.spec.pytorch_replica_specs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing PyTorchJobs\n", - "\n", - "List PyTorchJobs in the current namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[job.metadata.name for job in client.list_pytorchjobs()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create PyTorchJob\n", - "\n", - "Create a PyTorchJob using the SDK." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.create_pytorchjob(pytorchjob)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Get PyTorchJob\n", - "Get the created PyTorchJob by name and check its data. \n", - "Make sure that it completes successfully before proceeding. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# verify that the Job was created successfully\n", - "# raises an error if it doesn't exist\n", - "pytorchjob = client.get_pytorchjob(name=PYTORCHJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for the Job to complete successfully\n", - "assert_job_succeeded(client, PYTORCHJOB_NAME, job_kind=\"PyTorchJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Job:\", pytorchjob.metadata.name, end=\"\\n\\n\")\n", - "print(\"Job Spec:\", pytorchjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", - "print(\"Job Status:\", pytorchjob.status, sep=\"\\n\", end=\"\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get PyTorchJob Training logs\n", - "Get and print the training logs of the PyTorchJob with the training steps " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print_training_logs(client, PYTORCHJOB_NAME, container=PYTORCHJOB_CONTAINER)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete PyTorchJob\n", - "\n", - "Delete the created PyTorchJob." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.delete_pytorchjob(name=PYTORCHJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=10),\n", - " stop=stop_after_attempt(30),\n", - " reraise=True,\n", - ")\n", - "def assert_pytorchjob_removed(client, job_name):\n", - " \"\"\"Wait for PyTorchJob to be removed.\"\"\"\n", - " # fetch the existing PyTorchJob names\n", - " # verify that the Job was deleted successfully\n", - " jobs = {job.metadata.name for job in client.list_pytorchjobs()}\n", - " assert job_name not in jobs, f\"Failed to delete PyTorchJob {job_name}!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for PyTorch job to be removed successfully\n", - "assert_pytorchjob_removed(client, PYTORCHJOB_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test PaddlePaddle" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a PaddleJob\n", - "\n", - "Define a PaddleJob object before deploying it." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "PADDLEJOB_NAME = \"paddle-simple-cpu\"\n", - "PADDLEJOB_CONTAINER = \"paddle\"\n", - "PADDLEJOB_IMAGE = \"docker.io/paddlepaddle/paddle:2.4.0rc0-cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "port = V1ContainerPort(container_port=37777, name=\"master\")\n", - "\n", - "container = V1Container(\n", - " name=PADDLEJOB_CONTAINER,\n", - " image=PADDLEJOB_IMAGE,\n", - " command=[\"python\"],\n", - " args=[\"-m\", \"paddle.distributed.launch\", \"run_check\"],\n", - " ports=[port],\n", - ")\n", - "\n", - "replica_spec = V1ReplicaSpec(\n", - " replicas=2,\n", - " restart_policy=\"OnFailure\",\n", - " template=V1PodTemplateSpec(\n", - " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", - " spec=V1PodSpec(containers=[container]),\n", - " ),\n", - ")\n", - "\n", - "paddlejob = KubeflowOrgV1PaddleJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"PaddleJob\",\n", - " metadata=V1ObjectMeta(name=PADDLEJOB_NAME, labels=training_labels),\n", - " spec=KubeflowOrgV1PaddleJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " paddle_replica_specs={\"Worker\": replica_spec},\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Print the Job's info to verify it before submission." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Name:\", paddlejob.metadata.name)\n", - "print(\"Spec:\", paddlejob.spec.paddle_replica_specs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing PaddleJobs\n", - "\n", - "List PaddleJobs in the current namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[job.metadata.name for job in client.list_paddlejobs()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create PaddleJob\n", - "\n", - "Create a PaddleJob using the SDK." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.create_paddlejob(paddlejob)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Get PaddleJob\n", - "Get the created PaddleJob by name and check its data. \n", - "Make sure that it completes successfully before proceeding. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# verify that the Job was created successfully\n", - "# raises an error if it doesn't exist\n", - "paddlejob = client.get_paddlejob(name=PADDLEJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for the Job to complete successfully\n", - "assert_job_succeeded(client, PADDLEJOB_NAME, job_kind=\"PaddleJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(\"Job:\", paddlejob.metadata.name, end=\"\\n\\n\")\n", - "print(\"Job Spec:\", paddlejob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", - "print(\"Job Status:\", paddlejob.status, sep=\"\\n\", end=\"\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get PaddleJob Training logs\n", - "Get and print the training logs of the PaddleJob with the training steps " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# set is_master to False because this example does not include a master replica type\n", - "print_training_logs(client, PADDLEJOB_NAME, container=PADDLEJOB_CONTAINER, is_master=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete PaddleJob\n", - "\n", - "Delete the created PaddleJob." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.delete_paddlejob(name=PADDLEJOB_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@retry(\n", - " wait=wait_exponential(multiplier=2, min=1, max=10),\n", - " stop=stop_after_attempt(30),\n", - " reraise=True,\n", - ")\n", - "def assert_paddlejob_removed(client, job_name):\n", - " \"\"\"Wait for PaddleJob to be removed.\"\"\"\n", - " # fetch the existing PaddleJob names\n", - " # verify that the Job was deleted successfully\n", - " jobs = {job.metadata.name for job in client.list_paddlejobs()}\n", - " assert job_name not in jobs, f\"Failed to delete PaddleJob {job_name}!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "raises-exception" - ] - }, - "outputs": [], - "source": [ - "# wait for PaddleJob to be removed successfully\n", - "assert_paddlejob_removed(client, PADDLEJOB_NAME)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Training Operator Integration\n", + "\n", + "This example notebook is loosely based on the following upstream examples:\n", + "* [TFJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/tensorflow/image-classification/create-tfjob.ipynb)\n", + "* 
[PyTorchJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/pytorch/image-classification/create-pytorchjob.ipynb)\n", + "* [PaddleJob](https://github.com/kubeflow/training-operator/blob/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/examples/paddlepaddle/simple-cpu.yaml)\n", + "\n", + "Note that the above can get out of sync with the actual testing upstream does, so make sure to also check out [upstream E2E tests](https://github.com/kubeflow/training-operator/tree/964a6e836eedff11edfe79cc9f4e5b7c623cbe88/sdk/python/test/e2e) for updating the notebook.\n", + "\n", + "The workflow for each job (TFJob, PyTorchJob, and PaddleJob) is:\n", + "- create training job\n", + "- monitor its execution\n", + "- get training logs\n", + "- delete job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "pytest-skip" + ] + }, + "outputs": [], + "source": [ + "# Please check the requirements.in file for more details\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from kubeflow.training import (\n", + " KubeflowOrgV1PaddleJob,\n", + " KubeflowOrgV1PaddleJobSpec,\n", + " KubeflowOrgV1PyTorchJob,\n", + " KubeflowOrgV1PyTorchJobSpec,\n", + " KubeflowOrgV1TFJob,\n", + " KubeflowOrgV1TFJobSpec,\n", + " TrainingClient,\n", + " V1ReplicaSpec,\n", + " V1RunPolicy,\n", + ")\n", + "from kubernetes.client import (\n", + " V1Container,\n", + " V1ContainerPort,\n", + " V1ObjectMeta,\n", + " V1PodSpec,\n", + " V1PodTemplateSpec,\n", + ")\n", + "from tenacity import retry, stop_after_attempt, wait_exponential\n", + "\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialise Training Client\n", + "\n", + "We will be using the Training SDK for any actions executed as part of this example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = TrainingClient()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Helper to print training logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_training_logs(client, job_name: str, container: str, is_master: bool = True):\n", + " logs = client.get_job_logs(name=job_name, container=container, is_master=is_master)\n", + " print(logs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Helper to check that Job succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=30),\n", + " stop=stop_after_attempt(50),\n", + " reraise=True,\n", + ")\n", + "def assert_job_succeeded(client, job_name, job_kind):\n", + " \"\"\"Wait for the Job to complete successfully.\"\"\"\n", + " assert client.is_job_succeeded(\n", + " name=job_name, job_kind=job_kind\n", + " ), f\"Job {job_name} was not successful.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Helper to get the spec labels\n", + "This will add the label for the proxy PodDefault if the proxy envs are set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_labels = {}\n", + "if os.environ.get('HTTP_PROXY') and os.environ.get('HTTPS_PROXY') and os.environ.get('NO_PROXY'):\n", + " training_labels = {\"notebook-proxy\": \"true\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test TFJob" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a TFJob\n", + "\n", + "Define a TFJob object before deploying it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TFJOB_NAME = \"mnist\"\n", + "TFJOB_CONTAINER = \"tensorflow\"\n", + "TFJOB_IMAGE = \"kubeflow/tf-mnist-with-summaries:latest\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = V1Container(\n", + " name=TFJOB_CONTAINER,\n", + " image=TFJOB_IMAGE,\n", + " command=[\n", + " \"sh\", \"-c\",\n", + " # Download MNIST dataset using curl to overcome proxy problems https://github.com/canonical/training-operator/issues/182\n", + " # Mnist data loads from /tmp/tensorflow/mnist/input_data [see reference in mnist_with_summaries.py](https://github.com/kubeflow/training-operator/blob/master/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py#L213)\n", + " \"mkdir -p /tmp/tensorflow/mnist/input_data/ && \" +\n", + " \"curl -L -o /tmp/tensorflow/mnist/input_data/train-images-idx3-ubyte.gz https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/train-images-idx3-ubyte.gz && \" +\n", + " \"curl -L -o /tmp/tensorflow/mnist/input_data/train-labels-idx1-ubyte.gz https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/train-labels-idx1-ubyte.gz && \" +\n", + " \"curl -L -o /tmp/tensorflow/mnist/input_data/t10k-images-idx3-ubyte.gz https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-images-idx3-ubyte.gz && \" +\n", + " \"curl -L -o /tmp/tensorflow/mnist/input_data/t10k-labels-idx1-ubyte.gz https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-labels-idx1-ubyte.gz && \" +\n", + " # Run the TensorFlow script after downloading the dataset\n", + " \"python /var/tf_mnist/mnist_with_summaries.py --log_dir=/train/logs --learning_rate=0.01 --batch_size=150\"\n", + " ],\n", + ")\n", + "\n", + "worker = V1ReplicaSpec(\n", + " replicas=2,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "chief = V1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "ps = V1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "tfjob = KubeflowOrgV1TFJob(\n", + " api_version=\"kubeflow.org/v1\",\n", + " kind=\"TFJob\",\n", + " metadata=V1ObjectMeta(name=TFJOB_NAME),\n", + " spec=KubeflowOrgV1TFJobSpec(\n", + " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " tf_replica_specs={\"Worker\": worker, \"Chief\": chief, \"PS\": ps},\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the Job's info to verify it before submission." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Name:\", tfjob.metadata.name)\n", + "print(\"Spec:\", tfjob.spec.tf_replica_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List existing TFJobs\n", + "\n", + "List TFJobs in the current namespace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[job.metadata.name for job in client.list_tfjobs()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create TFJob\n", + "\n", + "Create a TFJob using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.create_tfjob(tfjob)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Get TFJob\n", + "Get the created TFJob by name and check its data. \n", + "Make sure that it completes successfully before proceeding. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# verify that the Job was created successfully\n", + "# raises an error if it doesn't exist\n", + "tfjob = client.get_tfjob(name=TFJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for the Job to complete successfully\n", + "assert_job_succeeded(client, TFJOB_NAME, job_kind=\"TFJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Job:\", tfjob.metadata.name, end=\"\\n\\n\")\n", + "print(\"Job Spec:\", tfjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", + "print(\"Job Status:\", tfjob.status, sep=\"\\n\", end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get TFJob Training logs\n", + "Get and print the training logs of the TFJob with the training steps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_training_logs(client, TFJOB_NAME, container=TFJOB_CONTAINER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete TFJob\n", + "\n", + "Delete the created TFJob." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_tfjob(name=TFJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=10),\n", + " stop=stop_after_attempt(30),\n", + " reraise=True,\n", + ")\n", + "def assert_tfjob_removed(client, job_name):\n", + " \"\"\"Wait for TFJob to be removed.\"\"\"\n", + " # fetch the existing TFJob names\n", + " # verify that the Job was deleted successfully\n", + " jobs = {job.metadata.name for job in client.list_tfjobs()}\n", + " assert job_name not in jobs, f\"Failed to delete TFJob {job_name}!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for TFJob resources to be removed successfully\n", + "assert_tfjob_removed(client, TFJOB_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test PyTorchJob" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a PyTorchJob\n", + "Define a PyTorchJob object before deploying it. This PyTorchJob is similar to [this](https://github.com/kubeflow/training-operator/blob/11b7a115e6538caeab405344af98f0d5b42a4c96/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb) example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PYTORCHJOB_NAME = \"pytorch-mnist-gloo\"\n", + "PYTORCHJOB_CONTAINER = \"pytorch\"\n", + "PYTORCHJOB_IMAGE = \"kubeflowkatib/pytorch-mnist-cpu:v0.16.0\"\n", + "# The image above should be updated with each release with the corresponding Katib version used in CKF release.\n", + "# Note that instead of using the [image from training-operator repository](https://github.com/kubeflow/training-operator/blob/master/examples/pytorch/mnist/Dockerfile),\n", + "# the one [from Katib](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu) is being used\n", + "# due to the large size of the first one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = V1Container(\n", + " name=PYTORCHJOB_CONTAINER,\n", + " image=PYTORCHJOB_IMAGE,\n", + " args=[\"--backend\", \"gloo\", \"--epochs\", \"2\"],\n", + " # Passing `epochs`argument since kubeflowkatib image defaults to 10.\n", + ")\n", + "\n", + "replica_spec = V1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"OnFailure\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}, labels=training_labels),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "pytorchjob = KubeflowOrgV1PyTorchJob(\n", + " api_version=\"kubeflow.org/v1\",\n", + " kind=\"PyTorchJob\",\n", + " metadata=V1ObjectMeta(name=PYTORCHJOB_NAME),\n", + " spec=KubeflowOrgV1PyTorchJobSpec(\n", + " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " pytorch_replica_specs={\"Master\": replica_spec, \"Worker\": replica_spec},\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the Job's info to verify it before submission." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Name:\", pytorchjob.metadata.name)\n", + "print(\"Spec:\", pytorchjob.spec.pytorch_replica_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List existing PyTorchJobs\n", + "\n", + "List PyTorchJobs in the current namespace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[job.metadata.name for job in client.list_pytorchjobs()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create PyTorchJob\n", + "\n", + "Create a PyTorchJob using the SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.create_pytorchjob(pytorchjob)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Get PyTorchJob\n", + "Get the created PyTorchJob by name and check its data. \n", + "Make sure that it completes successfully before proceeding. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# verify that the Job was created successfully\n", + "# raises an error if it doesn't exist\n", + "pytorchjob = client.get_pytorchjob(name=PYTORCHJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for the Job to complete successfully\n", + "assert_job_succeeded(client, PYTORCHJOB_NAME, job_kind=\"PyTorchJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Job:\", pytorchjob.metadata.name, end=\"\\n\\n\")\n", + "print(\"Job Spec:\", pytorchjob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", + "print(\"Job Status:\", pytorchjob.status, sep=\"\\n\", end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get PyTorchJob Training logs\n", + "Get and print the training logs of the PyTorchJob with the training steps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_training_logs(client, PYTORCHJOB_NAME, container=PYTORCHJOB_CONTAINER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete PyTorchJob\n", + "\n", + "Delete the created PyTorchJob." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_pytorchjob(name=PYTORCHJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=10),\n", + " stop=stop_after_attempt(30),\n", + " reraise=True,\n", + ")\n", + "def assert_pytorchjob_removed(client, job_name):\n", + " \"\"\"Wait for PyTorchJob to be removed.\"\"\"\n", + " # fetch the existing PyTorchJob names\n", + " # verify that the Job was deleted successfully\n", + " jobs = {job.metadata.name for job in client.list_pytorchjobs()}\n", + " assert job_name not in jobs, f\"Failed to delete PyTorchJob {job_name}!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for PyTorch job to be removed successfully\n", + "assert_pytorchjob_removed(client, PYTORCHJOB_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test PaddlePaddle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a PaddleJob\n", + "\n", + "Define a PaddleJob object before deploying it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PADDLEJOB_NAME = \"paddle-simple-cpu\"\n", + "PADDLEJOB_CONTAINER = \"paddle\"\n", + "PADDLEJOB_IMAGE = \"docker.io/paddlepaddle/paddle:2.4.0rc0-cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "port = V1ContainerPort(container_port=37777, name=\"master\")\n", + "\n", + "container = V1Container(\n", + " name=PADDLEJOB_CONTAINER,\n", + " image=PADDLEJOB_IMAGE,\n", + " command=[\"python\"],\n", + " args=[\"-m\", \"paddle.distributed.launch\", \"run_check\"],\n", + " ports=[port],\n", + ")\n", + "\n", + "replica_spec = V1ReplicaSpec(\n", + " replicas=2,\n", + " restart_policy=\"OnFailure\",\n", + " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(annotations={\"sidecar.istio.io/inject\": \"false\"}),\n", + " spec=V1PodSpec(containers=[container]),\n", + " ),\n", + ")\n", + "\n", + "paddlejob = KubeflowOrgV1PaddleJob(\n", + " api_version=\"kubeflow.org/v1\",\n", + " kind=\"PaddleJob\",\n", + " metadata=V1ObjectMeta(name=PADDLEJOB_NAME, labels=training_labels),\n", + " spec=KubeflowOrgV1PaddleJobSpec(\n", + " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " paddle_replica_specs={\"Worker\": replica_spec},\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the Job's info to verify it before submission." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Name:\", paddlejob.metadata.name)\n", + "print(\"Spec:\", paddlejob.spec.paddle_replica_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List existing PaddleJobs\n", + "\n", + "List PaddleJobs in the current namespace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[job.metadata.name for job in client.list_paddlejobs()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create PaddleJob\n", + "\n", + "Create a PaddleJob using the SDK." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.create_paddlejob(paddlejob)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Get PaddleJob\n", + "Get the created PaddleJob by name and check its data. \n", + "Make sure that it completes successfully before proceeding. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# verify that the Job was created successfully\n", + "# raises an error if it doesn't exist\n", + "paddlejob = client.get_paddlejob(name=PADDLEJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for the Job to complete successfully\n", + "assert_job_succeeded(client, PADDLEJOB_NAME, job_kind=\"PaddleJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"Job:\", paddlejob.metadata.name, end=\"\\n\\n\")\n", + "print(\"Job Spec:\", paddlejob.spec, sep=\"\\n\", end=\"\\n\\n\")\n", + "print(\"Job Status:\", paddlejob.status, sep=\"\\n\", end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get PaddleJob Training logs\n", + "Get and print the training logs of the PaddleJob with the training steps " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set is_master to False because this example does not include a master replica type\n", + "print_training_logs(client, PADDLEJOB_NAME, container=PADDLEJOB_CONTAINER, is_master=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete PaddleJob\n", + "\n", + "Delete the created PaddleJob." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_paddlejob(name=PADDLEJOB_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@retry(\n", + " wait=wait_exponential(multiplier=2, min=1, max=10),\n", + " stop=stop_after_attempt(30),\n", + " reraise=True,\n", + ")\n", + "def assert_paddlejob_removed(client, job_name):\n", + " \"\"\"Wait for PaddleJob to be removed.\"\"\"\n", + " # fetch the existing PaddleJob names\n", + " # verify that the Job was deleted successfully\n", + " jobs = {job.metadata.name for job in client.list_paddlejobs()}\n", + " assert job_name not in jobs, f\"Failed to delete PaddleJob {job_name}!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "# wait for PaddleJob to be removed successfully\n", + "assert_paddlejob_removed(client, PADDLEJOB_NAME)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 66fe640dccbc57377cc256c07bbd0262a8649fb9 Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Fri, 23 Aug 2024 17:06:56 +0200 Subject: [PATCH 3/5] tests: add proxy option to test file for driver (#103) * tests: add proxy option to test file for driver Adding a proxy option to the driver/test_kubeflow_workloads.py script, will allow users to tell the driver it will run in a proxied environment. With this, a PodDefault will be used for setting environment variables (NO_PROXY, HTTP_PROXY, HTTPS_PROXY) in the Pod(s) that execute the Job for UATs. Fixes #96 --- README.md | 62 +++++++++++++++++++++---------- assets/test-job.yaml.j2 | 3 ++ driver/conftest.py | 11 ++++++ driver/test_kubeflow_workloads.py | 57 +++++++++++++++++++++++++++- tests/proxy-poddefault.yaml | 22 ----------- tests/proxy-poddefault.yaml.j2 | 45 ++++++++++++++++++++++ 6 files changed, 156 insertions(+), 44 deletions(-) delete mode 100644 tests/proxy-poddefault.yaml create mode 100644 tests/proxy-poddefault.yaml.j2 diff --git a/README.md b/README.md index 837f93e..c5af1ad 100644 --- a/README.md +++ b/README.md @@ -140,30 +140,44 @@ tox -e kubeflow-local ``` ### Run behind proxy -#### Prerequistes -**To run the tests behind proxy using Notebook or using the driver, the following step is necessary:** + +#### Running using Notebook + +##### Prerequistes Edit the PodDefault `tests/proxy-poddefault.yaml` to replace the placeholders for: - * `:`: The address and port of your proxy server - * ``: you can get this value by running: - ``` - cat /var/snap/microk8s/current/args/kube-proxy | grep cluster-cidr - ``` - * ``: you can get this value by running: - ``` - cat /var/snap/microk8s/current/args/kube-apiserver | grep service-cluster-ip-range - ``` + +* `http_proxy` and `https_proxy` - The address and port of your proxy server, format should be `:` +* `no_proxy` - A comma separated list of items that should not be proxied. 
+
+  `<cluster cidr>,<service cluster ip range>,127.0.0.1,localhost,<nodes internal ip>/24,<hostname>,.svc,.local`
+
+  where:
+
+  * `<cluster cidr>`: you can get this value by running:
+
+    ```
+    cat /var/snap/microk8s/current/args/kube-proxy | grep cluster-cidr
+    ```
+
+  * `<service cluster ip range>`: you can get this value by running:
+
+    ```
+    cat /var/snap/microk8s/current/args/kube-apiserver | grep service-cluster-ip-range
+    ```
 
- * `<nodes internal ip>`: the Internal IP of the nodes where your cluster is running, you can
-   get this value by running:
-   ```
-   microk8s kubectl get nodes -o wide
-   ```
-   It is the `INTERNAL-IP` value
- * `<hostname>`: the name of your host on which the cluster is deployed, you can use the
-   `hostname` command to get it
+  * `<nodes internal ip>`: the Internal IP of the nodes where your cluster is running, you can get this value by running:
+
+    ```
+    microk8s kubectl get nodes -o wide
+    ```
+    It is the `INTERNAL-IP` value
+
+  * `<hostname>`: the name of the host on which the cluster is deployed; you can get it with the `hostname` command
+
+  * `localhost` and `127.0.0.1`: recommended, so that requests to the local host are not proxied
+
-#### Running using Notebook
 To run the tests behind proxy using Notebook:
 1. Login to the Dashboard and Create a Profile
 2. Apply the PodDefault to your Profile's namespace, make sure you already followed the Prerequisites
@@ -188,6 +202,14 @@ To run the tests behind proxy using Notebook:
   * kfp_v2
   * training (except TFJob due to https://github.com/canonical/training-operator/issues/182)
 
+#### Running using `driver`
+
+You can pass the `--proxy` flag with the proxy values to the `tox` command; this automatically applies the required changes to run behind a proxy.
+
+```bash
+tox -e kubeflow-<local|remote> -- --proxy http_proxy="http_proxy:port" https_proxy="https_proxy:port" no_proxy="<cluster cidr>,<service cluster ip range>,127.0.0.1,localhost,<nodes internal ip>/24,<hostname>,.svc,.local"
+```
+
 #### Developer Notes
 
 Any environment that can be used to access and configure the Charmed Kubeflow deployment is
diff --git a/assets/test-job.yaml.j2 b/assets/test-job.yaml.j2
index 89d254f..2dd22ca 100644
--- a/assets/test-job.yaml.j2
+++ b/assets/test-job.yaml.j2
@@ -7,6 +7,9 @@ spec:
   template:
     metadata:
       labels:
+        {% if proxy %}
+        notebook-proxy: "true"
+        {% endif %}
         access-minio: "true"
         access-ml-pipeline: "true"
         mlflow-server-minio: "true"
diff --git a/driver/conftest.py b/driver/conftest.py
index aabb9fc..cf66f58 100644
--- a/driver/conftest.py
+++ b/driver/conftest.py
@@ -10,6 +10,17 @@ def pytest_addoption(parser: Parser):
     * Add a `--filter` option to (de)select test cases based on their name
       (see also https://docs.pytest.org/en/7.4.x/reference/reference.html#command-line-flags)
     """
+    parser.addoption(
+        "--proxy",
+        nargs=3,
+        metavar=("http_proxy", "https_proxy", "no_proxy"),
+        help="Set a number of key-value pairs for the proxy environment variables."
+        " Example: "
+        "--proxy http_proxy='proxy:port' https_proxy='proxy:port' no_proxy='<comma-separated list>'."
+        " If used, a PodDefault will be rendered and applied to the Kubernetes deployment."
+        " It is not used by default.",
+        action="store",
+    )
     parser.addoption(
         "--filter",
         help="Provide a filter to (de)select tests cases based on their name. The filter follows"
diff --git a/driver/test_kubeflow_workloads.py b/driver/test_kubeflow_workloads.py
index 203a995..5d9dd22 100644
--- a/driver/test_kubeflow_workloads.py
+++ b/driver/test_kubeflow_workloads.py
@@ -5,10 +5,15 @@
 import os
 import subprocess
 from pathlib import Path
+from typing import Dict
 
 import pytest
 from lightkube import ApiError, Client, codecs
-from lightkube.generic_resource import create_global_resource, load_in_cluster_generic_resources
+from lightkube.generic_resource import (
+    create_global_resource,
+    create_namespaced_resource,
+    load_in_cluster_generic_resources,
+)
 from utils import assert_namespace_active, delete_job, fetch_job_logs, wait_for_job
 
 log = logging.getLogger(__name__)
@@ -34,6 +39,14 @@
 PYTEST_CMD_BASE = "pytest"
 
+PODDEFAULT_RESOURCE = create_namespaced_resource(
+    group="kubeflow.org",
+    version="v1alpha1",
+    kind="poddefault",
+    plural="poddefaults",
+)
+PODDEFAULT_WITH_PROXY_PATH = Path("tests") / "proxy-poddefault.yaml.j2"
+
 
 @pytest.fixture(scope="session")
 def pytest_filter(request):
@@ -83,6 +96,33 @@ def create_profile(lightkube_client):
     lightkube_client.delete(PROFILE_RESOURCE, name=NAMESPACE)
 
 
+@pytest.fixture(scope="function")
+def create_poddefaults_on_proxy(request, lightkube_client):
+    """Create PodDefault with proxy env variables for the Notebook inside the Job."""
+    # Simply yield if the proxy flag is not set
+    if not request.config.getoption("proxy"):
+        yield
+    else:
+        log.info("Adding PodDefault with proxy settings.")
+        poddefault_resource = codecs.load_all_yaml(
+            PODDEFAULT_WITH_PROXY_PATH.read_text(),
+            context=proxy_context(request),
+        )
+        # Using the first item of the list of poddefault_resource. It is a one-item list.
+        lightkube_client.create(poddefault_resource[0], namespace=NAMESPACE)
+
+        yield
+
+        # delete the PodDefault at the end of the test
+        log.info("Deleting PodDefault...")
+        poddefault_resource = codecs.load_all_yaml(
+            PODDEFAULT_WITH_PROXY_PATH.read_text(),
+            context=proxy_context(request),
+        )
+        poddefault_name = poddefault_resource[0].metadata.name
+        lightkube_client.delete(PODDEFAULT_RESOURCE, name=poddefault_name, namespace=NAMESPACE)
+
+
 @pytest.mark.abort_on_fail
 async def test_create_profile(lightkube_client, create_profile):
     """Test Profile creation.
@@ -105,7 +145,9 @@ async def test_create_profile(lightkube_client, create_profile):
     assert_namespace_active(lightkube_client, NAMESPACE)
 
 
-def test_kubeflow_workloads(lightkube_client, pytest_cmd, tests_checked_out_commit):
+def test_kubeflow_workloads(
+    lightkube_client, pytest_cmd, tests_checked_out_commit, request, create_poddefaults_on_proxy
+):
     """Run a K8s Job to execute the notebook tests."""
     log.info(f"Starting Kubernetes Job {NAMESPACE}/{JOB_NAME} to run notebook tests...")
     resources = list(
@@ -118,9 +160,11 @@ def test_kubeflow_workloads(lightkube_client, pytest_cmd, tests_checked_out_comm
             "tests_image": TESTS_IMAGE,
             "tests_remote_commit": tests_checked_out_commit,
             "pytest_cmd": pytest_cmd,
+            "proxy": True if request.config.getoption("proxy") else False,
         },
     )
 )
+    assert len(resources) == 1, f"Expected 1 Job, got {len(resources)}!"
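+    # The template renders exactly one Job; when --proxy is set, the matching
+    # PodDefault has already been applied by the create_poddefaults_on_proxy fixture.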
 
     lightkube_client.create(resources[0], namespace=NAMESPACE)
 
@@ -140,3 +184,12 @@ def teardown_module():
     """Cleanup resources."""
     log.info(f"Deleting Job {NAMESPACE}/{JOB_NAME}...")
     delete_job(JOB_NAME, NAMESPACE)
+
+
+def proxy_context(request) -> Dict[str, str]:
+    """Return a dictionary with proxy environment variables from user input."""
+    proxy_context = {}
+    for proxy in request.config.getoption("proxy"):
+        # split only on the first "=" so proxy values that themselves contain "=" stay intact
+        key, value = proxy.split("=", 1)
+        proxy_context[key] = value
+    return proxy_context
diff --git a/tests/proxy-poddefault.yaml b/tests/proxy-poddefault.yaml
deleted file mode 100644
index a1f7300..0000000
--- a/tests/proxy-poddefault.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-apiVersion: kubeflow.org/v1alpha1
-kind: PodDefault
-metadata:
-  name: notebook-proxy
-spec:
-  desc: Add proxy settings
-  env:
-    - name: HTTP_PROXY
-      value: <proxy address>:<proxy port>
-    - name: http_proxy
-      value: <proxy address>:<proxy port>
-    - name: HTTPS_PROXY
-      value: <proxy address>:<proxy port>
-    - name: https_proxy
-      value: <proxy address>:<proxy port>
-    - name: NO_PROXY
-      value: <cluster cidr>,<service cluster ip range>,127.0.0.1,<nodes internal ip>/24,<hostname>,.svc,.local
-    - name: no_proxy
-      value: <cluster cidr>,<service cluster ip range>,127.0.0.1,<nodes internal ip>/24,<hostname>,.svc,.local
-  selector:
-    matchLabels:
-      notebook-proxy: "true"
diff --git a/tests/proxy-poddefault.yaml.j2 b/tests/proxy-poddefault.yaml.j2
new file mode 100644
index 0000000..c5a5868
--- /dev/null
+++ b/tests/proxy-poddefault.yaml.j2
@@ -0,0 +1,45 @@
+apiVersion: kubeflow.org/v1alpha1
+kind: PodDefault
+metadata:
+  name: notebook-proxy
+spec:
+  desc: Add proxy settings
+  env:
+    - name: HTTP_PROXY
+      value: {{ http_proxy }}
+    - name: http_proxy
+      value: {{ http_proxy }}
+    - name: HTTPS_PROXY
+      value: {{ https_proxy }}
+    - name: https_proxy
+      value: {{ https_proxy }}
+    - name: NO_PROXY
+      value: {{ no_proxy }}
+    - name: no_proxy
+      value: {{ no_proxy }}
+  _example_env:
+    ################################
+    #                              #
+    #    EXAMPLE CONFIGURATION     #
+    #                              #
+    ################################
+
+    # This is not actually functional; it just serves as an example of how to configure
+    # the proxy values and which ones have to be included to make things work properly.
+    # If you are running the UATs directly in a Notebook, please modify the env block
+    # above with the values that fit your specific configuration.
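+
+    # The angle-bracket values below are placeholders; the README's "Run behind
+    # proxy" section describes how to find the right value for each one.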
+    - name: HTTP_PROXY
+      value: <proxy address>:<proxy port>
+    - name: http_proxy
+      value: <proxy address>:<proxy port>
+    - name: HTTPS_PROXY
+      value: <proxy address>:<proxy port>
+    - name: https_proxy
+      value: <proxy address>:<proxy port>
+    - name: NO_PROXY
+      value: <cluster cidr>,<service cluster ip range>,127.0.0.1,localhost,<nodes internal ip>/24,<hostname>,.svc,.local
+    - name: no_proxy
+      value: <cluster cidr>,<service cluster ip range>,127.0.0.1,localhost,<nodes internal ip>/24,<hostname>,.svc,.local
+  selector:
+    matchLabels:
+      notebook-proxy: "true"

From fe0e41ff27a5ccb25e3b24a155c7dad07433fd1a Mon Sep 17 00:00:00 2001
From: Daniela Plascencia
Date: Tue, 27 Aug 2024 15:05:08 +0200
Subject: [PATCH 4/5] docs: remove tfjob note as
 canonical/training-operator#182 is fixed (#111)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c5af1ad..38fafe4 100644
--- a/README.md
+++ b/README.md
@@ -200,7 +200,7 @@ To run the tests behind proxy using Notebook:
   * katib
   * kserve
   * kfp_v2
-  * training (except TFJob due to https://github.com/canonical/training-operator/issues/182)
+  * training
 
 #### Running using `driver`
 

From 135396083d594ab77418029c31f6a3a8ee246a2e Mon Sep 17 00:00:00 2001
From: Noha Ihab <49988746+NohaIhab@users.noreply.github.com>
Date: Tue, 27 Aug 2024 16:29:04 +0300
Subject: [PATCH 5/5] readme: Add to proxy instructions the KServe and Knative
 proxy configs (#112)

---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index 38fafe4..ca87727 100644
--- a/README.md
+++ b/README.md
@@ -141,6 +141,31 @@ tox -e kubeflow-local
 
 ### Run behind proxy
 
+#### Prerequisites for KServe UATs
+
+To run the KServe UATs behind a proxy, you first need to configure the `kserve-controller`
+and `knative-serving` charms to function behind a proxy.
+
+> [!NOTE]
+> For information on how to fill out the proxy config values, see the `Running using Notebook > Prerequisites` section below.
+
+1. Set the `http-proxy`, `https-proxy`, and `no-proxy` configs in the `kserve-controller` charm:
+```
+juju config kserve-controller http-proxy=<proxy address>:<proxy port> https-proxy=<proxy address>:<proxy port> no-proxy=<cluster cidr>,<service cluster ip range>,127.0.0.1,localhost,<nodes internal ip>/24,<hostname>,.svc,.local
+```
+
+2. Set the `http-proxy`, `https-proxy`, and `no-proxy` configs in the `knative-serving` charm:
+```
+juju config knative-serving http-proxy=<proxy address>:<proxy port> https-proxy=<proxy address>:<proxy port> no-proxy=<cluster cidr>,<service cluster ip range>,127.0.0.1,localhost,<nodes internal ip>/24,<hostname>,.svc,.local
+```
+
+For example:
+```
+juju config knative-serving http-proxy=http://10.0.13.50:3128/ https-proxy=http://10.0.13.50:3128/ no-proxy=10.1.0.0/16,10.152.183.0/24,127.0.0.1,localhost,10.0.2.0/24,ip-10-0-2-157,.svc,.local
+
+juju config kserve-controller http-proxy=http://10.0.13.50:3128/ https-proxy=http://10.0.13.50:3128/ no-proxy=10.1.0.0/16,10.152.183.0/24,127.0.0.1,localhost,10.0.2.0/24,ip-10-0-2-157,.svc,.local
+```
+
 #### Running using Notebook
 
 ##### Prerequisites
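For reference, the driver-side flow introduced in PATCH 3/5 reduces to two lightkube calls: render the PodDefault template with the user-supplied proxy values, then create the resulting object in the test namespace. The sketch below reproduces that flow outside of pytest. It is a minimal illustration, not part of the patch series; the proxy values mirror the juju example above, and the `test-kubeflow` namespace is a hypothetical placeholder.

```python
# Minimal sketch: render tests/proxy-poddefault.yaml.j2 the same way the
# driver's fixture does, then apply it with lightkube.
# Assumptions: lightkube and jinja2 are installed, the kubeconfig points at
# the cluster, and "test-kubeflow" is the (hypothetical) profile namespace.
from pathlib import Path

from lightkube import Client, codecs
from lightkube.generic_resource import create_namespaced_resource

# Register the PodDefault CRD so codecs.load_all_yaml can build objects for it.
create_namespaced_resource(
    group="kubeflow.org", version="v1alpha1", kind="PodDefault", plural="poddefaults"
)

# Hypothetical proxy values -- replace with the values for your environment.
context = {
    "http_proxy": "http://10.0.13.50:3128/",
    "https_proxy": "http://10.0.13.50:3128/",
    "no_proxy": "10.1.0.0/16,10.152.183.0/24,127.0.0.1,localhost,.svc,.local",
}

client = Client()
# Passing a context makes load_all_yaml treat the input as a jinja2 template.
for resource in codecs.load_all_yaml(
    Path("tests/proxy-poddefault.yaml.j2").read_text(), context=context
):
    client.create(resource, namespace="test-kubeflow")
```

Once the PodDefault exists, any Pod created in that namespace with the `notebook-proxy: "true"` label (which the Job template adds when `proxy` is true) gets the proxy environment variables injected.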