Skip to content

Commit

Permalink
feat: update ray resources to match the newer/cleaner torchx resources form
Browse files Browse the repository at this point in the history

This allows fixing an issue where `codeflare logs`, when late-attaching, may not stream out GPU utilization.

BREAKING CHANGE: this changes the structure of the ray form; tests may need updates. Also, any automated -y runs will require an update.
  • Loading branch information
starpit committed Mar 19, 2023
1 parent 2c12e78 commit 35260c2
Show file tree
Hide file tree
Showing 14 changed files with 75 additions and 35 deletions.
34 changes: 17 additions & 17 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions plugins/plugin-codeflare/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@
"@types/split2": "^3.2.1"
},
"dependencies": {
"@guidebooks/store": "^6.1.9",
"@guidebooks/store": "^6.2.1",
"@logdna/tail-file": "^3.0.1",
"@patternfly/react-charts": "^6.94.18",
"@patternfly/react-core": "^4.276.6",
"asciinema-player": "^3.1.0",
"chokidar": "^3.5.3",
"madwizard": "^6.4.1",
"madwizard": "^6.5.3",
"needle": "^3.2.0",
"open": "^8.4.2",
"pretty-bytes": "^6.1.0",
Expand Down
2 changes: 1 addition & 1 deletion tests/kind/profiles/non-gpu1/keep-it-simple
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
8 changes: 7 additions & 1 deletion tests/kind/profiles/non-gpu1/mcad-coscheduler
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": {
"Number of Workers": 1,
"CPUs per worker": "200m",
"GPUs per worker": 0,
"Memory per worker": "1.25Gi",
"Ephemeral Storage per worker": "5Gi"
},
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
2 changes: 1 addition & 1 deletion tests/kind/profiles/non-gpu1/mcad-default
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"200m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.25Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
8 changes: 7 additions & 1 deletion tests/kind/profiles/non-gpu1/mcad-preinstalled
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": {
"Number of Workers": 1,
"CPUs per worker": "200m",
"GPUs per worker": 0,
"Memory per worker": "1.25Gi",
"Ephemeral Storage per worker": "5Gi"
},
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
8 changes: 7 additions & 1 deletion tests/kind/profiles/non-gpu1/ray-autoscaler
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"0\",\"Maximum Workers\":\"0\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"2.5Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": {
"Number of Workers": 1,
"CPUs per worker": "200m",
"GPUs per worker": 0,
"Memory per worker": "1.25Gi",
"Ephemeral Storage per worker": "5Gi"
},
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
10 changes: 8 additions & 2 deletions tests/kind/profiles/non-gpu2/keep-it-simple
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/ray-basic\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": {
"Number of Workers": 1,
"CPUs per worker": "500m",
"GPUs per worker": 0,
"Memory per worker": "1.5Gi",
"Ephemeral Storage per worker": "5Gi"
},
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
"ml/ray/cluster/choose": "codeflare-test-ray-cluster",
"ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
"ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
}
}
}
8 changes: 7 additions & 1 deletion tests/kind/profiles/non-gpu3/keep-it-simple
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": {
"Number of Workers": 1,
"CPUs per worker": "500m",
"GPUs per worker": 0,
"Memory per worker": "1.5Gi",
"Ephemeral Storage per worker": "5Gi"
},
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
8 changes: 7 additions & 1 deletion tests/kind/profiles/non-gpu4/keep-it-simple
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": {
"Number of Workers": 1,
"CPUs per worker": "500m",
"GPUs per worker": 0,
"Memory per worker": "1.5Gi",
"Ephemeral Storage per worker": "5Gi"
},
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
2 changes: 1 addition & 1 deletion tests/kind/profiles/non-gpu5/keep-it-simple
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit-with-dashdash\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 intentionally-not-main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
2 changes: 1 addition & 1 deletion tests/kind/profiles/non-gpu6/keep-it-simple
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"ghcr.io/pytorch/torchx:0.5.0dev0\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand Down
4 changes: 2 additions & 2 deletions tests/kind/profiles/non-gpu6/mcad-default
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/torchx\",\"Base image\":\"bitnami/pytorch:1.13.1\",\"Command line prefix\":\"python3 compute_world_size/main.py\"}",
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
"s3/choose/bucket/maybe": "My data is not stored in S3",
"ml/torchx/run/resources": "{\"Number of Workers\":\"1\",\"CPUs per worker\":\"500m\",\"GPUs per worker\":\"0\",\"Memory per worker\":\"500Mi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
"kubernetes/context": "kind-codeflare-test",
"kubernetes/choose/ns": "default",
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
Expand All @@ -24,4 +24,4 @@
"kubernetes/mcad/choose/job-priority": "Default Priority",
"kubernetes/mcad/choose/scheduler": "MCAD with the Default Kubernetes Scheduler"
}
}
}
10 changes: 7 additions & 3 deletions tests/kind/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@ export RAY_KUBE_CLUSTER_NAME=codeflare-test-ray-cluster
export NODE=node
export CODEFLARE_HEADLESS_HOME=${CODEFLARE_HEADLESS_HOME-$ROOT/dist/headless}

while getopts "ab:f:is:" opt
while getopts "Vab:f:is:" opt
do
case $opt in
V) VERBOSE=true; continue;;
a) FORCE_ALL=true; continue;;
f) FORCE=$OPTARG; continue;;
s) export GUIDEBOOK_STORE=$OPTARG; echo "[Test] Using store=$GUIDEBOOK_STORE"; continue;;
Expand Down Expand Up @@ -79,7 +80,10 @@ function run {
fi

local guidebook=${2-$GUIDEBOOK}
local yes=$([ -z "$FORCE_ALL" ] && [ "$FORCE" != "$profileFull" ] && [ -f "$MWPROFILES_PATH/$profile" ] && echo "--yes" || echo "")
local yes=${YES-$([ -z "$FORCE_ALL" ] && [ "$FORCE" != "$profileFull" ] && [ -f "$MWPROFILES_PATH/$profile" ] && echo "--yes" || echo "")}
if [[ -n "$VERBOSE" ]]; then
local verbose="-V"
fi

local dashdashFile="$MWPROFILES_PATH_BASE"/$variant/dashdash.txt
if [ -f "$dashdashFile" ]; then
Expand All @@ -98,7 +102,7 @@ function run {
fi

echo "[Test] Running with variant=$variant profile=$profile yes=$yes"
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -p $profile $yes $guidebook -- $DASHDASH | tee $OUTPUT
GUIDEBOOK_NAME="main-job-run" "$ROOT"/bin/codeflare -p $profile $verbose $yes $guidebook -- $DASHDASH | tee $OUTPUT
}

#
Expand Down

0 comments on commit 35260c2

Please sign in to comment.