Skip to content

Commit

Permalink
[Telemetry][Kuberentes] Distinguish Kubernetes deployment stacks (#28490
Browse files Browse the repository at this point in the history
)

Right now, Ray telemetry indicates the majority of Ray's CPU hour usage comes from Ray running within a Kubernetes cluster. However, we have no data on what method is used to deploy Ray on Kubernetes.

This PR enables Ray telemetry to distinguish between three methods of deploying Ray on Kubernetes:

KubeRay >= 0.4.0
Legacy Ray Operator with Ray >= 2.1.0
All other methods
The strategy is to have the operators inject an env variable into the Ray container's environment.
The variable identifies the deployment method.

This PR also modifies the legacy Ray operator to inject the relevant env variable.
A follow-up KubeRay PR changes the KubeRay operator to do the same thing: ray-project/kuberay#562

Signed-off-by: Dmitri Gekhtman <[email protected]>
  • Loading branch information
DmitriGekhtman authored Sep 14, 2022
1 parent c91b4a7 commit 9ff23cd
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 2 deletions.
8 changes: 8 additions & 0 deletions python/ray/_private/usage/usage_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,11 @@
EXTRA_USAGE_TAG_PREFIX = "extra_usage_tag_"

USAGE_STATS_NAMESPACE = "usage_stats"

KUBERNETES_SERVICE_HOST_ENV = "KUBERNETES_SERVICE_HOST"
KUBERAY_ENV = "RAY_USAGE_STATS_KUBERAY_IN_USE"
LEGACY_RAY_OPERATOR_ENV = "RAY_USAGE_STATS_LEGACY_OPERATOR_IN_USE"

PROVIDER_KUBERNETES_GENERIC = "kubernetes"
PROVIDER_KUBERAY = "kuberay"
PROVIDER_LEGACY_RAY_OPERATOR = "legacy_ray_operator"
13 changes: 11 additions & 2 deletions python/ray/_private/usage/usage_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,8 +757,17 @@ def get_instance_type(node_config):
except FileNotFoundError:
# It's a manually started cluster or k8s cluster
result = ClusterConfigToReport()
if "KUBERNETES_SERVICE_HOST" in os.environ:
result.cloud_provider = "kubernetes"
# Check if we're on Kubernetes
if usage_constant.KUBERNETES_SERVICE_HOST_ENV in os.environ:
# Check if we're using KubeRay >= 0.4.0.
if usage_constant.KUBERAY_ENV in os.environ:
result.cloud_provider = usage_constant.PROVIDER_KUBERAY
# Check if we're using the legacy Ray Operator with Ray >= 2.1.0.
elif usage_constant.LEGACY_RAY_OPERATOR_ENV in os.environ:
result.cloud_provider = usage_constant.PROVIDER_LEGACY_RAY_OPERATOR
# Else, we're on Kubernetes but not in either of the above categories.
else:
result.cloud_provider = usage_constant.PROVIDER_KUBERNETES_GENERIC
return result
except Exception as e:
logger.info(f"Failed to get cluster config to report {e}")
Expand Down
9 changes: 9 additions & 0 deletions python/ray/ray_operator/operator_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from kubernetes.watch import Watch

from ray._private import ray_constants
from ray._private.usage import usage_constants
from ray.autoscaler._private._kubernetes import custom_objects_api
from ray.autoscaler._private._kubernetes.node_provider import head_service_selector
from ray.autoscaler._private.providers import _get_default_config
Expand Down Expand Up @@ -137,6 +138,14 @@ def get_node_types(
if name == cluster_resource["spec"]["headPodType"]:
if "labels" not in metadata:
metadata["labels"] = {}
# Insert env identifying legacy operator for telemetry.
env = node_type["node_config"]["spec"]["containers"][0].setdefault("env", [])
env.append(
{
"name": usage_constants.LEGACY_RAY_OPERATOR_ENV,
"value": "1",
}
)
node_types[name] = node_type
return node_types

Expand Down
12 changes: 12 additions & 0 deletions python/ray/tests/test_k8s_operator_unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,18 @@ def custom_resources():


class OperatorTest(unittest.TestCase):
def test_env_var_configured(self):
cr, _ = custom_resources()
config = cr_to_config(cr)
for node_type in config["available_node_types"].values():
pod_config = node_type["node_config"]
expected_env = {
"name": "RAY_USAGE_STATS_LEGACY_OPERATOR_IN_USE",
"value": "1",
}
envs = pod_config["spec"]["containers"][0]["env"]
assert expected_env in envs

def test_no_file_mounts_k8s_operator_cluster_launch(self):
with patch.object(NodeUpdaterThread, START, mock_start), patch.object(
NodeUpdaterThread, JOIN, mock_join
Expand Down
13 changes: 13 additions & 0 deletions python/ray/tests/test_usage_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,19 @@ def test_usage_lib_get_cluster_config_to_report(
assert cluster_config_to_report.head_node_instance_type is None
assert cluster_config_to_report.worker_node_instance_types is None

monkeypatch.setenv("RAY_USAGE_STATS_KUBERAY_IN_USE", "1")
cluster_config_to_report = ray_usage_lib.get_cluster_config_to_report(
tmp_path / "does_not_exist.yaml"
)
assert cluster_config_to_report.cloud_provider == "kuberay"

monkeypatch.delenv("RAY_USAGE_STATS_KUBERAY_IN_USE")
monkeypatch.setenv("RAY_USAGE_STATS_LEGACY_OPERATOR_IN_USE", "1")
cluster_config_to_report = ray_usage_lib.get_cluster_config_to_report(
tmp_path / "does_not_exist.yaml"
)
assert cluster_config_to_report.cloud_provider == "legacy_ray_operator"


@pytest.mark.skipif(
sys.platform == "win32",
Expand Down

0 comments on commit 9ff23cd

Please sign in to comment.