Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix autohealing policy update logic. #4264

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 34 additions & 43 deletions src/clusterfuzz/_internal/cron/manage_vms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import itertools
import json
import logging
from typing import Optional

from google.cloud import ndb

Expand Down Expand Up @@ -156,6 +157,37 @@ def _template_needs_update(current_template, new_template, resource_name):
return False


def _update_auto_healing_policy(
instance_group: bot_manager.InstanceGroup, instance_group_body,
new_policy: Optional[compute_engine_projects.AutoHealingPolicy]):
"""Updates the given instance group's auto-healing policy if need be."""
old_policy_dict = None
policies = instance_group_body.get('autoHealingPolicies')
if policies:
old_policy_dict = policies[0]

new_policy_dict = None
if new_policy is not None:
new_policy_dict = {
'healthCheck': new_policy.health_check,
'initialDelaySec': new_policy.initial_delay_sec,
}

if new_policy_dict == old_policy_dict:
return

logging.info('Updating auto-healing policy from %s to %s', old_policy_dict,
new_policy_dict)

try:
instance_group.patch_auto_healing_policies(
auto_healing_policy=new_policy_dict, wait_for_instances=False)
except bot_manager.OperationError as e:
logging.error(
'Failed to patch auto-healing policies for instance group %s: %s',
instance_group.name, e)


class ClustersManager:
"""Manager for clusters in a project."""

Expand Down Expand Up @@ -253,49 +285,8 @@ def update_cluster(self,
else:
logging.info('No instance group size changes needed.')

# Check if needs to update autoHealingPolicies.
auto_healing_policy = {}
# Check if needs to update health check URL in autoHealingPolicies.
old_url = instance_group_body.get('auto_healing_policy',
{}).get('health_check')
new_url = cluster.auto_healing_policy.get('health_check')

if new_url != old_url:
logging.info(
'Updating the health check URL in auto_healing_policy'
'of instance group %s from %s to %s.', resource_name, old_url,
new_url)
auto_healing_policy['healthCheck'] = new_url

# Check if needs to update initial delay in autoHealingPolicies.
old_delay = instance_group_body.get('auto_healing_policy',
{}).get('initial_delay_sec')
new_delay = cluster.auto_healing_policy.get('initial_delay_sec')

if new_delay != old_delay:
logging.info(
'Updating the health check initial delay in auto_healing_policy'
'of instance group %s from %s seconds to %s seconds.',
resource_name, old_delay, new_delay)
auto_healing_policy['initialDelaySec'] = new_delay

# Send one request to update either or both if needed
if auto_healing_policy:
if new_url is None or new_delay is None:
auto_healing_policy = {}
if new_url is not None or new_delay is not None:
logging.warning(
'Deleting auto_healing_policy '
'because its two values (health_check, initial_delay_sec) '
'should never exist independently: (%s, %s)', new_url,
new_delay)
try:
instance_group.patch_auto_healing_policies(
auto_healing_policy=auto_healing_policy,
wait_for_instances=False)
except bot_manager.OperationError as e:
logging.error('Failed to create instance group %s: %s',
resource_name, str(e))
_update_auto_healing_policy(instance_group, instance_group_body,
cluster.auto_healing_policy)
return

if template_needs_update:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
"""Load project data."""

from collections import namedtuple
import dataclasses
import logging
import os
from typing import Any
from typing import Dict
from typing import Optional

from clusterfuzz._internal.config import local_config
from clusterfuzz._internal.system import environment
Expand Down Expand Up @@ -54,6 +59,44 @@ def get_cluster(self, name):
['host', 'worker', 'workers_per_host'])


@dataclasses.dataclass(kw_only=True)
class AutoHealingPolicy:
"""Represents an auto-healing policy for an instance group."""
# The health check URL.
health_check: str

# The initial delay to apply to the auto-healing policy, in seconds.
# This controls the policy but not the health check - health checks starts as
# soon as the instance is created, but the instance group manager ignores
# unhealthy instances for this delay after creation.
initial_delay_sec: int

@classmethod
def from_config(
cls, policy: Optional[Dict[str, Any]]) -> Optional['AutoHealingPolicy']:
"""Attempts to create a policy from the given yaml configuration value."""
if policy is None:
return None

health_check = policy.get('health_check')
initial_delay_sec = policy.get('initial_delay_sec')
if health_check is None and initial_delay_sec is None:
return None

if health_check is None or initial_delay_sec is None:
logging.warning(
'Ignoring auto_healing_policy '
'because its two values (health_check, initial_delay_sec) '
'should never exist independently: (%s, %s)', health_check,
initial_delay_sec)
return None

assert isinstance(health_check, str), repr(health_check)
assert isinstance(initial_delay_sec, int), repr(initial_delay_sec)

return cls(health_check=health_check, initial_delay_sec=initial_delay_sec)


def _process_instance_template(instance_template):
"""Process instance template, normalizing some of metadata key values."""
# Load metadata items for a particular instance template.
Expand Down Expand Up @@ -83,7 +126,8 @@ def _config_to_project(name, config):
instance_count=zone['instance_count'],
instance_template=zone['instance_template'],
distribute=zone.get('distribute', False),
auto_healing_policy=zone.get('auto_healing_policy', {}),
auto_healing_policy=AutoHealingPolicy.from_config(
zone.get('auto_healing_policy')),
worker=zone.get('worker', False),
high_end=zone.get('high_end', False)))

Expand Down
Loading