Skip to content

Commit

Permalink
[azure][autoscaler] Fix Azure autoscaler node naming and deletion del…
Browse files Browse the repository at this point in the history
…ays (ray-project#31645)

This reverts prior changes to node naming that led to non-unique names, which caused nodes to be refreshed constantly.
Currently the Azure autoscaler blocks while waiting for node destruction to complete, so this change removes that blocking wait.

Related issue number
Closes ray-project#31538
Closes ray-project#25971

---------

Signed-off-by: Scott Graham <[email protected]>
Co-authored-by: Scott Graham <[email protected]>
Signed-off-by: Jack He <[email protected]>
  • Loading branch information
2 people authored and ProjectsByJackHe committed May 4, 2023
1 parent 82d3090 commit 32f57a6
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 15 deletions.
12 changes: 6 additions & 6 deletions python/ray/autoscaler/_private/_azure/azure-config-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"uniqueId": {
"clusterId": {
"type": "string",
"metadata": {
"description": "Unique string appended to resource names to isolate resources from different ray clusters."
Expand All @@ -18,12 +18,12 @@
"variables": {
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]",
"msiName": "[concat('ray-msi-', parameters('uniqueId'))]",
"roleAssignmentName": "[concat('ray-ra-', parameters('uniqueId'))]",
"nsgName": "[concat('ray-nsg-', parameters('uniqueId'))]",
"msiName": "[concat('ray-', parameters('clusterId'), '-msi')]",
"roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]",
"nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]",
"nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]",
"vnetName": "[concat('ray-vnet-', parameters('uniqueId'))]",
"subnetName": "[concat('ray-subnet-', parameters('uniqueId'))]"
"vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]",
"subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]"
},
"resources": [
{
Expand Down
8 changes: 5 additions & 3 deletions python/ray/autoscaler/_private/_azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _configure_resource_group(config):
if "tags" in config["provider"]:
params["tags"] = config["provider"]["tags"]

logger.info("Creating/Updating Resource Group: %s", resource_group)
logger.info("Creating/Updating resource group: %s", resource_group)
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups, function_name="create_or_update"
)
Expand All @@ -76,17 +76,19 @@ def _configure_resource_group(config):
with open(template_path, "r") as template_fp:
template = json.load(template_fp)

logger.info("Using cluster name: %s", config["cluster_name"])

# set unique id for resources in this cluster
unique_id = config["provider"].get("unique_id")
if unique_id is None:
hasher = sha256()
hasher.update(config["provider"]["resource_group"].encode("utf-8"))
hasher.update(config["cluster_name"].encode("utf-8"))
unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN]
else:
unique_id = str(unique_id)
config["provider"]["unique_id"] = unique_id
logger.info("Using unique id: %s", unique_id)
cluster_id = "{}-{}".format(config["cluster_name"], unique_id)

subnet_mask = config["provider"].get("subnet_mask")
if subnet_mask is None:
Expand All @@ -101,7 +103,7 @@ def _configure_resource_group(config):
"template": template,
"parameters": {
"subnet": {"value": subnet_mask},
"uniqueId": {"value": unique_id},
"clusterId": {"value": cluster_id},
},
}
}
Expand Down
15 changes: 9 additions & 6 deletions python/ray/autoscaler/_private/_azure/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
from pathlib import Path
from threading import RLock
from uuid import uuid4

from azure.identity import DefaultAzureCredential
from azure.mgmt.compute import ComputeManagementClient
Expand All @@ -23,6 +24,7 @@
)

VM_NAME_MAX_LEN = 64
UNIQUE_ID_LEN = 4

logger = logging.getLogger(__name__)
azure_logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
Expand Down Expand Up @@ -221,10 +223,11 @@ def _create_node(self, node_config, tags, count):
config_tags.update(tags)
config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node")
vm_name = "{name}-{id}".format(
name=name_tag, id=self.provider_config["unique_id"]
)
vm_name = "{node}-{unique_id}-{vm_id}".format(
node=config_tags.get(TAG_RAY_NODE_NAME, "node"),
unique_id=self.provider_config["unique_id"],
vm_id=uuid4().hex[:UNIQUE_ID_LEN],
)[:VM_NAME_MAX_LEN]
use_internal_ips = self.provider_config.get("use_internal_ips", False)

template_params = node_config["azure_arm_parameters"].copy()
Expand Down Expand Up @@ -252,7 +255,7 @@ def _create_node(self, node_config, tags, count):
)
create_or_update(
resource_group_name=resource_group,
deployment_name="ray-vm-{}".format(name_tag),
deployment_name=vm_name,
parameters=parameters,
).wait()

Expand Down Expand Up @@ -311,7 +314,7 @@ def terminate_node(self, node_id):
delete = get_azure_sdk_function(
client=self.compute_client.virtual_machines, function_name="delete"
)
delete(resource_group_name=resource_group, vm_name=node_id).wait()
delete(resource_group_name=resource_group, vm_name=node_id)
except Exception as e:
logger.warning("Failed to delete VM: {}".format(e))

Expand Down

0 comments on commit 32f57a6

Please sign in to comment.