Skip to content

Commit

Permalink
[autoscaler] AWS Autoscaler CloudWatch Dashboard support (#20266)
Browse files Browse the repository at this point in the history
These changes add a set of improvements to enable automatic creation and update of CloudWatch dashboards when provisioning AWS Autoscaling clusters. Successful implementation of these improvements will allow AWS Autoscaler users to:

1. Get rapid insights into their cluster state via CloudWatch dashboards.
2. Update their CloudWatch dashboard JSON configuration files while `ray up` is executing.

Notes:
1. This PR is a follow-up to #18619 and adds dashboard support.
  • Loading branch information
Zyiqin-Miranda authored Jan 10, 2022
1 parent 6420c75 commit 71fae21
Show file tree
Hide file tree
Showing 7 changed files with 565 additions and 242 deletions.
413 changes: 234 additions & 179 deletions python/ray/autoscaler/_private/aws/cloudwatch/cloudwatch_helper.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/ray/autoscaler/_private/aws/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def _create_node(self, node_config, tags, count):
"Value": v,
})
if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
"config"):
"agent"):
cwa_installed = self._check_ami_cwa_installation(node_config)
if cwa_installed:
tag_pairs.extend([{
Expand Down
2 changes: 1 addition & 1 deletion python/ray/autoscaler/_private/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def do_update(self):
from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper \
import CloudwatchHelper
CloudwatchHelper(self.provider.provider_config,
[self.node_id], self.provider.cluster_name). \
self.node_id, self.provider.cluster_name). \
update_from_config(self.is_head_node)

if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@
"metrics_collection_interval":60,
"run_as_user":"root"
},
"csm":{
"memory_limit_in_mb":20,
"port":31000
},
"logs":{
"metrics_collected": {
"prometheus": {
Expand Down Expand Up @@ -116,60 +112,72 @@
}
}
},
"metrics":{
"append_dimensions":{
"AutoScalingGroupName":"${aws:AutoScalingGroupName}",
"InstanceId":"${aws:InstanceId}"
},
"metrics_collected":{
"collectd":{
"metrics_aggregation_interval":60
},
"cpu":{
"measurement":[
"usage_active",
"usage_system",
"usage_user",
"usage_idle",
"time_active",
"time_system",
"time_user",
"time_idle"
]
},
"processes":{
"measurement":[
"processes_running",
"processes_sleeping",
"processes_zombies",
"processes_dead",
"processes_total"
],
"metrics_collection_interval":60,
"resources":[
"*"
]
},
"disk":{
"measurement":[
"disk_used_percent"
],
"metrics_collection_interval":60,
"resources":[
"*"
]
},
"mem":{
"measurement":[
"mem_used_percent"
],
"metrics_collection_interval":60
},
"statsd":{
"metrics_aggregation_interval":60,
"metrics_collection_interval":10,
"service_address":":8125"
}
}
"metrics": {
"namespace": "{cluster_name}-ray-CWAgent",
"aggregation_dimensions": [
[
"InstanceId"
]
],
"append_dimensions": {
"AutoScalingGroupName": "${aws:AutoScalingGroupName}",
"InstanceId": "${aws:InstanceId}"
},
"metrics_collected": {
"collectd": {
"metrics_aggregation_interval": 60
},
"cpu": {
"measurement": [
"usage_active",
"usage_system",
"usage_user",
"usage_idle",
"time_active",
"time_system",
"time_user",
"time_idle"
],
"resources": [
"*"
]
},
"processes": {
"measurement": [
"processes_running",
"processes_sleeping",
"processes_zombies",
"processes_dead",
"processes_total"
],
"metrics_collection_interval": 60,
"resources": [
"*"
]
},
"disk": {
"measurement": [
"disk_used_percent"
],
"metrics_collection_interval": 60,
"resources": [
"/"
]
},
"mem": {
"measurement": [
"mem_used_percent"
],
"metrics_collection_interval": 60,
"resources": [
"*"
]
},
"statsd": {
"metrics_aggregation_interval": 60,
"metrics_collection_interval": 10,
"service_address": ":8125"
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
[
{
"type":"explorer",
"x":12,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
{
"metricName": "CPUUtilization",
"resourceType": "AWS::EC2::Instance",
"stat": "Average"
}
],
"aggregateBy": {
"key": "*",
"func": "SUM"
},
"labels": [
{
"key": "cloudwatch-agent-installed",
"value": "True"
},
{
"key": "ray-cluster-name",
"value": "{cluster_name}"
}
],
"widgetOptions": {
"legend": {
"position": "bottom"
},
"view": "timeSeries",
"stacked": false,
"rowsPerPage": 1,
"widgetsPerRow": 1
},
"title":"Cluster CPU Utilization"
}
},
{
"type":"explorer",
"x":0,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
{
"metricName": "CPUUtilization",
"resourceType": "AWS::EC2::Instance",
"stat": "Average"
}
],
"aggregateBy": {
"key": "*",
"func": "AVG"
},
"labels": [
{
"key": "cloudwatch-agent-installed",
"value": "True"
},
{
"key": "ray-cluster-name",
"value": "{cluster_name}"
}
],
"widgetOptions": {
"legend": {
"position": "bottom"
},
"view": "timeSeries",
"stacked": false,
"rowsPerPage": 1,
"widgetsPerRow": 1
},
"title":"Single Node CPU Utilization (Avg and Max)"
}
},
{
"type":"metric",
"x":12,
"y":6,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"metrics":[
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process sum", "id": "e1" } ],
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process sum", "id": "e2" } ]
],
"region":"{region}",
"stat":"Average",
"period":60,
"title":"Cluster Processes"
}
},
{
"type":"metric",
"x":0,
"y":6,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"metrics":[
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process average", "id": "e3" } ],
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process average", "id": "e4" } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_running', 'Average', 300))", "label": "cluster running process maximum", "id": "e5" } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} processes_sleeping', 'Average', 300))", "label": "cluster sleeping process maximum", "id": "e6" } ]
],
"region":"{region}",
"stat":"Average",
"period":60,
"title":"Single Node Processes (Avg and Max)"
}
},
{
"type":"metric",
"x":12,
"y":12,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"stacked":false,
"metrics":[
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Average', 300))", "label": "cluster disk used percent sum", "id": "e7", "period": 300 } ]

],
"region":"{region}",
"title":"Cluster Disk Usage"
}
},
{
"type":"metric",
"x":0,
"y":12,
"width":12,
"height":6,
"properties":{
"view":"timeSeries",
"stacked":false,
"metrics":[
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Average', 300))", "id": "e8", "label": "cluster disk used percent average", "period": 300 } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} disk_used_percent', 'Maximum', 300))", "id": "e9", "label": "cluster disk used percent maximum", "period": 300 } ]

],
"region":"{region}",
"title":"Single Node Disk Usage (Avg and Max)"
}
},
{
"type":"metric",
"x":12,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Average', 300))", "id": "e10", "label": "cluster mem used percent sum", "period": 300 } ]

],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Cluster Memory Usage"
}
},
{
"type":"metric",
"x":0,
"y":18,
"width":12,
"height":6,
"properties": {
"metrics": [
[ { "expression": "AVG(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Average', 300))", "id": "e11", "label": "cluster mem used percent average", "period": 300 } ],
[ { "expression": "MAX(SEARCH('{{cluster_name}-ray-CWAgent,InstanceId} mem_used_percent', 'Maximum', 300))", "id": "e12", "label": "cluster mem used percent maximum", "period": 300 } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Single Node Memory Usage (Avg and Max)"
}
},
{
"height": 6,
"width": 12,
"y": 0,
"x": 0,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-prometheus,instance} ray_node_cpu_count', 'Maximum', 300))", "label": "cluster cpu sum", "id": "e13" } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Cluster CPUs"
}
},
{
"height": 6,
"width": 12,
"y": 0,
"x": 12,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SUM(SEARCH('{{cluster_name}-ray-prometheus,instance} object_store_available_memory', 'Average', 300))", "label": "cluster object store available memory sum", "id": "e14" } ]
],
"view": "timeSeries",
"stacked": false,
"region": "{region}",
"stat": "Maximum",
"period": 300,
"start": "-PT2H",
"end": "P0D",
"title": "Cluster Object Store Available Memory"
}
}
]

Loading

0 comments on commit 71fae21

Please sign in to comment.