From 72992785ae0e4ffb86afdb7a29e9fc7ddac365f3 Mon Sep 17 00:00:00 2001
From: Yunxuan Xiao <xiaoyunxuan1998@gmail.com>
Date: Fri, 24 Feb 2023 16:30:45 -0800
Subject: [PATCH] [Doc] Add Configuring Batch Predictor section in Predictors
 User Guides (#32436)

* [minor] add num_gpus_per_worker in BatchPredictor example

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>

* add configuring batch predictor section

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>

* Update doc/source/ray-air/doc_code/predictors.py

Co-authored-by: Justin Yu <justinvyu@anyscale.com>
Signed-off-by: Yunxuan Xiao <xiaoyunxuan1998@gmail.com>

* Update doc/source/ray-air/doc_code/predictors.py

Co-authored-by: Justin Yu <justinvyu@anyscale.com>
Signed-off-by: Yunxuan Xiao <xiaoyunxuan1998@gmail.com>

* fix typo

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>

* make examples into 3 subsections

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MBP.local.meter>

* split configure examples into 3 sections

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>

* modify BUILD file to acquire GPUs for ci workers

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MBP.local.meter>

* Change BUILD file to enable GPU tests

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>

* fix typo

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>

* not specify gpu and cpu at the same time

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MBP.local.meter>

* Update BUILD

Signed-off-by: Yunxuan Xiao <xiaoyunxuan1998@gmail.com>

* resolve tf Blas error

Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>

* resolve tf cublas error

Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>

---------

Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>
Signed-off-by: Yunxuan Xiao <xiaoyunxuan1998@gmail.com>
Signed-off-by: Yunxuan Xiao <yunxuanx@Yunxuans-MBP.local.meter>
Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>
Co-authored-by: Yunxuan Xiao <yunxuanx@Yunxuans-MacBook-Pro.local>
Co-authored-by: Justin Yu <justinvyu@anyscale.com>
Co-authored-by: Yunxuan Xiao <yunxuanx@Yunxuans-MBP.local.meter>
Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
---
 doc/BUILD                                     | 18 +++++-
 doc/source/ray-air/doc_code/predictors.py     | 37 +++++++++++
 doc/source/ray-air/examples/BUILD             | 11 +++-
 .../examples/torch_image_batch_pretrained.py  |  2 +-
 doc/source/ray-air/predictors.rst             | 62 +++++++++++++++++++
 python/ray/train/batch_predictor.py           |  3 +
 6 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/doc/BUILD b/doc/BUILD
index 45da5d3a659a..8417eb6c0e01 100644
--- a/doc/BUILD
+++ b/doc/BUILD
@@ -183,7 +183,10 @@ py_test_run_all_subdirectory(
 py_test_run_all_subdirectory(
     size = "large",
     include = ["source/ray-air/doc_code/*.py"],
-    exclude = ["source/ray-air/doc_code/hf_trainer.py"],  # Too large
+    exclude = [
+        "source/ray-air/doc_code/hf_trainer.py",  # Too large
+        "source/ray-air/doc_code/predictors.py",
+    ],
     extra_srcs = [],
     tags = ["exclusive", "team:ml"],
 )
@@ -211,3 +214,16 @@ py_test_run_all_subdirectory(
     extra_srcs = [],
     tags = ["exclusive", "team:ml"],
 )
+
+
+# --------------
+# Run GPU tests
+# --------------
+
+py_test_run_all_subdirectory(
+    size = "large",
+    include = ["source/ray-air/doc_code/predictors.py"],
+    exclude = [],
+    extra_srcs = [],
+    tags = ["exclusive", "team:ml", "ray_air", "gpu"],
+)
diff --git a/doc/source/ray-air/doc_code/predictors.py b/doc/source/ray-air/doc_code/predictors.py
index a97802b73e8e..327d5a95cd4a 100644
--- a/doc/source/ray-air/doc_code/predictors.py
+++ b/doc/source/ray-air/doc_code/predictors.py
@@ -1,6 +1,11 @@
 # flake8: noqa
 # isort: skip_file
 
+
+import os
+
+os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
+
 # __use_predictor_start__
 import numpy as np
 import tensorflow as tf
@@ -76,6 +81,38 @@ def calculate_accuracy(df):
 # Final accuracy:  0.5
 # __compute_accuracy_end__
 
+
+# __configure_batch_predictor_cpu_only_start__
+predictions = batch_predictor.predict(
+    ds,
+    feature_columns=["feature_1"],
+    min_scoring_workers=2,
+    max_scoring_workers=2,
+    num_cpus_per_worker=3,
+)
+# __configure_batch_predictor_cpu_only_end__
+
+# __configure_batch_predictor_gpu_only_start__
+
+predictions = batch_predictor.predict(
+    ds,
+    feature_columns=["feature_1"],
+    min_scoring_workers=2,
+    max_scoring_workers=2,
+    num_gpus_per_worker=1,
+)
+# __configure_batch_predictor_gpu_only_end__
+
+# __configure_batch_predictor_scaling_start__
+predictions = batch_predictor.predict(
+    ds,
+    feature_columns=["feature_1"],
+    min_scoring_workers=1,
+    max_scoring_workers=4,
+    num_cpus_per_worker=3,
+)
+# __configure_batch_predictor_scaling_end__
+
 # __pipelined_prediction_start__
 import pandas as pd
 import ray
diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD
index 19e4e1056ffb..f9ffb2e66c17 100644
--- a/doc/source/ray-air/examples/BUILD
+++ b/doc/source/ray-air/examples/BUILD
@@ -15,12 +15,21 @@ filegroup(
 py_test_run_all_subdirectory(
     size = "medium",
     include = ["*.py"],
-    exclude = [],
+    exclude = ["torch_image_batch_pretrained.py"],
     extra_srcs = [],
     data = ["//doc/source/ray-air/examples:air_examples"],
     tags = ["exclusive", "team:ml", "ray_air"],
 )
 
+py_test_run_all_subdirectory(
+    size = "medium",
+    include = ["torch_image_batch_pretrained.py"],
+    exclude = [],
+    extra_srcs = [],
+    data = ["//doc/source/ray-air/examples:air_examples"],
+    tags = ["exclusive", "team:ml", "ray_air", "gpu"],
+)
+
 
 # --------------------------------------------------------------------
 # Test all doc/source/ray-air/examples notebooks.
diff --git a/doc/source/ray-air/examples/torch_image_batch_pretrained.py b/doc/source/ray-air/examples/torch_image_batch_pretrained.py
index a6fffe077845..ce3faff9b5bb 100644
--- a/doc/source/ray-air/examples/torch_image_batch_pretrained.py
+++ b/doc/source/ray-air/examples/torch_image_batch_pretrained.py
@@ -25,4 +25,4 @@
 ckpt = TorchCheckpoint.from_model(model=model, preprocessor=preprocessor)
 
 predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
-predictor.predict(dataset, batch_size=80)
+predictor.predict(dataset, batch_size=80, num_gpus_per_worker=1)
diff --git a/doc/source/ray-air/predictors.rst b/doc/source/ray-air/predictors.rst
index 99463c01b8f5..c0bd0b675a03 100644
--- a/doc/source/ray-air/predictors.rst
+++ b/doc/source/ray-air/predictors.rst
@@ -75,6 +75,68 @@ Additionally, you can compute metrics from the predictions. Do this by:
     :start-after: __compute_accuracy_start__
     :end-before: __compute_accuracy_end__
 
+
+Configuring Batch Prediction
+----------------------------
+To configure the compute resources for your `BatchPredictor`, you can set the following parameters in `predict()`:
+
+- `min_scoring_workers` and `max_scoring_workers`:
+
+  - The BatchPredictor will internally create an actor pool that autoscales the number of workers within the range [min, max] to execute your transforms.
+
+  - If not set, the auto-scaling range will be set to [1, inf) by default.
+
+- `num_gpus_per_worker`:
+
+  - If you want to use GPU for batch prediction, please set this parameter explicitly.
+
+  - If not specified, the BatchPredictor will perform inference on CPUs by default.
+
+- `num_cpus_per_worker`:
+
+  - Set the number of CPUs for a worker.
+
+- `separate_gpu_stage`:
+
+  - If using GPUs, whether to use separate stages for GPU inference and data preprocessing.
+
+  - Enabled by default to avoid excessive preprocessing workload on GPU workers. You may disable it if your preprocessor is very lightweight.
+
+Here are some examples:
+
+**1. Use multiple CPUs for Batch Prediction:**
+
+- If `num_gpus_per_worker` is not specified, CPUs are used for batch prediction by default.
+
+- Two workers with 3 CPUs each.
+
+.. literalinclude:: doc_code/predictors.py
+    :language: python
+    :start-after: __configure_batch_predictor_cpu_only_start__
+    :end-before: __configure_batch_predictor_cpu_only_end__
+
+**2. Use multiple GPUs for Batch Prediction:**
+
+- Two workers, each with 1 GPU and 1 CPU (by default).
+
+.. literalinclude:: doc_code/predictors.py
+    :language: python
+    :start-after: __configure_batch_predictor_gpu_only_start__
+    :end-before: __configure_batch_predictor_gpu_only_end__
+
+**3. Configure Auto-scaling:**
+
+- Scale from 1 to 4 workers, depending on your dataset size and cluster resources.
+
+- If no min/max values are provided, `BatchPredictor` will scale from 1 to inf workers by default.
+
+.. literalinclude:: doc_code/predictors.py
+    :language: python
+    :start-after: __configure_batch_predictor_scaling_start__
+    :end-before: __configure_batch_predictor_scaling_end__
+
+
+
 Batch Inference Examples
 ------------------------
 Below, we provide examples of using common frameworks to do batch inference for different data types:
diff --git a/python/ray/train/batch_predictor.py b/python/ray/train/batch_predictor.py
index 678da9d401af..a9fe0da466ad 100644
--- a/python/ray/train/batch_predictor.py
+++ b/python/ray/train/batch_predictor.py
@@ -116,7 +116,10 @@ def predict(
             min_scoring_workers: Minimum number of scoring actors.
             max_scoring_workers: If set, specify the maximum number of scoring actors.
             num_cpus_per_worker: Number of CPUs to allocate per scoring worker.
+                Set to 1 by default.
             num_gpus_per_worker: Number of GPUs to allocate per scoring worker.
+                Set to 0 by default. If you want to use GPUs for inference, please
+                specify this parameter.
             separate_gpu_stage: If using GPUs, specifies whether to execute GPU
                 processing in a separate stage (enabled by default). This avoids
                 running expensive preprocessing steps on GPU workers.