diff --git a/src/clusterfuzz/_internal/cron/manage_vms.py b/src/clusterfuzz/_internal/cron/manage_vms.py index 160b77b244..c8be892667 100644 --- a/src/clusterfuzz/_internal/cron/manage_vms.py +++ b/src/clusterfuzz/_internal/cron/manage_vms.py @@ -19,6 +19,9 @@ import itertools import json import logging +from typing import Any +from typing import Dict +from typing import Optional from google.cloud import ndb @@ -156,6 +159,43 @@ def _template_needs_update(current_template, new_template, resource_name): return False +def _auto_healing_policy_to_dict( + policy: compute_engine_projects.AutoHealingPolicy) -> Dict[str, Any]: + """Converts `policy` into its JSON API representation.""" + return { + 'healthCheck': policy.health_check, + 'initialDelaySec': policy.initial_delay_sec, + } + + +def _update_auto_healing_policy( + instance_group: bot_manager.InstanceGroup, instance_group_body, + new_policy: Optional[compute_engine_projects.AutoHealingPolicy]): + """Updates the given instance group's auto-healing policy if need be.""" + old_policy_dict = None + policies = instance_group_body.get('autoHealingPolicies') + if policies: + old_policy_dict = policies[0] + + new_policy_dict = None + if new_policy is not None: + new_policy_dict = _auto_healing_policy_to_dict(new_policy) + + if new_policy_dict == old_policy_dict: + return + + logging.info('Updating auto-healing policy from %s to %s', old_policy_dict, + new_policy_dict) + + try: + instance_group.patch_auto_healing_policies( + auto_healing_policy=new_policy_dict, wait_for_instances=False) + except bot_manager.OperationError as e: + logging.error( + 'Failed to patch auto-healing policies for instance group %s: %s', + instance_group.name, e) + + class ClustersManager: """Manager for clusters in a project.""" @@ -253,49 +293,8 @@ def update_cluster(self, else: logging.info('No instance group size changes needed.') - # Check if needs to update autoHealingPolicies. - auto_healing_policy = {} - # Check if needs to update health check URL in autoHealingPolicies. - old_url = instance_group_body.get('auto_healing_policy', - {}).get('health_check') - new_url = cluster.auto_healing_policy.get('health_check') - - if new_url != old_url: - logging.info( - 'Updating the health check URL in auto_healing_policy' - 'of instance group %s from %s to %s.', resource_name, old_url, - new_url) - auto_healing_policy['healthCheck'] = new_url - - # Check if needs to update initial delay in autoHealingPolicies. - old_delay = instance_group_body.get('auto_healing_policy', - {}).get('initial_delay_sec') - new_delay = cluster.auto_healing_policy.get('initial_delay_sec') - - if new_delay != old_delay: - logging.info( - 'Updating the health check initial delay in auto_healing_policy' - 'of instance group %s from %s seconds to %s seconds.', - resource_name, old_delay, new_delay) - auto_healing_policy['initialDelaySec'] = new_delay - - # Send one request to update either or both if needed - if auto_healing_policy: - if new_url is None or new_delay is None: - auto_healing_policy = {} - if new_url is not None or new_delay is not None: - logging.warning( - 'Deleting auto_healing_policy ' - 'because its two values (health_check, initial_delay_sec) ' - 'should never exist independently: (%s, %s)', new_url, - new_delay) - try: - instance_group.patch_auto_healing_policies( - auto_healing_policy=auto_healing_policy, - wait_for_instances=False) - except bot_manager.OperationError as e: - logging.error('Failed to create instance group %s: %s', - resource_name, str(e)) + _update_auto_healing_policy(instance_group, instance_group_body, + cluster.auto_healing_policy) return if template_needs_update: @@ -309,7 +308,8 @@ def update_cluster(self, resource_name, resource_name, size=cpu_count, - auto_healing_policy=cluster.auto_healing_policy, + auto_healing_policy=_auto_healing_policy_to_dict( + cluster.auto_healing_policy), wait_for_instances=False) except bot_manager.OperationError as e: logging.error('Failed to create instance group %s: %s', resource_name, diff --git a/src/clusterfuzz/_internal/google_cloud_utils/compute_engine_projects.py b/src/clusterfuzz/_internal/google_cloud_utils/compute_engine_projects.py index 5b2555abdf..7ddae36ff7 100644 --- a/src/clusterfuzz/_internal/google_cloud_utils/compute_engine_projects.py +++ b/src/clusterfuzz/_internal/google_cloud_utils/compute_engine_projects.py @@ -14,7 +14,12 @@ """Load project data.""" from collections import namedtuple +import dataclasses +import logging import os +from typing import Any +from typing import Dict +from typing import Optional from clusterfuzz._internal.config import local_config from clusterfuzz._internal.system import environment @@ -54,6 +59,44 @@ def get_cluster(self, name): ['host', 'worker', 'workers_per_host']) +@dataclasses.dataclass(kw_only=True) +class AutoHealingPolicy: + """Represents an auto-healing policy for an instance group.""" + # The health check URL. + health_check: str + + # The initial delay to apply to the auto-healing policy, in seconds. + # This controls the policy but not the health check - health checks starts as + # soon as the instance is created, but the instance group manager ignores + # unhealthy instances for this delay after creation. + initial_delay_sec: int + + @classmethod + def from_config( + cls, policy: Optional[Dict[str, Any]]) -> Optional['AutoHealingPolicy']: + """Attempts to create a policy from the given yaml configuration value.""" + if policy is None: + return None + + health_check = policy.get('health_check') + initial_delay_sec = policy.get('initial_delay_sec') + if health_check is None and initial_delay_sec is None: + return None + + if health_check is None or initial_delay_sec is None: + logging.warning( + 'Ignoring auto_healing_policy ' + 'because its two values (health_check, initial_delay_sec) ' + 'should never exist independently: (%s, %s)', health_check, + initial_delay_sec) + return None + + assert isinstance(health_check, str), repr(health_check) + assert isinstance(initial_delay_sec, int), repr(initial_delay_sec) + + return cls(health_check=health_check, initial_delay_sec=initial_delay_sec) + + def _process_instance_template(instance_template): """Process instance template, normalizing some of metadata key values.""" # Load metadata items for a particular instance template. @@ -83,7 +126,8 @@ def _config_to_project(name, config): instance_count=zone['instance_count'], instance_template=zone['instance_template'], distribute=zone.get('distribute', False), - auto_healing_policy=zone.get('auto_healing_policy', {}), + auto_healing_policy=AutoHealingPolicy.from_config( + zone.get('auto_healing_policy')), worker=zone.get('worker', False), high_end=zone.get('high_end', False)))