From ee1de7fffa7b70c1dcb1bba7a679f3c34b331427 Mon Sep 17 00:00:00 2001
From: Richard Liaw
Date: Sat, 23 Jul 2022 01:17:25 -0700
Subject: [PATCH] [air] large tune/torch benchmark (#26763)

Co-authored-by: Kai Fricke
Signed-off-by: Stefan van der Kleij
---
Reviewer notes (free-form text in this section is not part of the commit):
  * Adds a cluster compute config with one g4dn.12xlarge head node and
    seven g4dn.12xlarge workers (min_workers == max_workers == 7, no spot).
  * Registers the weekly release test
    `air_benchmark_tune_torch_mnist_large_gpu`, which waits for 8 nodes and
    runs workloads/tune_torch_benchmark.py with a 3600 timeout.
  * NOTE(review): line breaks and YAML indentation were reconstructed from a
    whitespace-collapsed copy of this patch -- confirm against the upstream
    commit before applying.
  * NOTE(review): the release_tests.yaml hunk declares 6 context lines, but
    its trailing context appears to be cut off after the
    `air_benchmark_tensorflow_mnist_cpu_4x1` line -- confirm against the
    full patch.

 .../air_benchmarks/compute_gpu_8_g4_12xl.yaml | 15 ++++++++++++
 release/release_tests.yaml                    | 24 +++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml

diff --git a/release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml b/release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml
new file mode 100644
index 000000000000..630fe5690d39
--- /dev/null
+++ b/release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml
@@ -0,0 +1,15 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 7
+
+head_node_type:
+    name: head_node
+    instance_type: g4dn.12xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: g4dn.12xlarge
+      max_workers: 7
+      min_workers: 7
+      use_spot: false
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 3b8cd1f6f69b..4c9c002b954c 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -414,6 +414,30 @@
 
   alert: default
 
+- name: air_benchmark_tune_torch_mnist_large_gpu
+  group: AIR tests
+  working_dir: air_tests/air_benchmarks
+
+  frequency: weekly
+  team: ml
+  env: staging
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: compute_gpu_8_g4_12xl.yaml
+
+  run:
+    timeout: 3600
+    script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 8 --use-gpu
+
+    wait_for_nodes:
+      num_nodes: 8
+
+    type: sdk_command
+    file_manager: job
+
+  alert: default
+
 # Ray AIR distributed Tensorflow benchmarks
 - name: air_benchmark_tensorflow_mnist_cpu_4x1