---
# overcloud_reboot.yml

#
# Reboot overcloud nodes one-by-one.
#
# Play 1: gather facts on every in-use overcloud node and run the
# version-discovery role with the discovery types listed below.
- name: Determine RHOS version
  hosts: overcloud_nodes:!unused
  any_errors_fatal: true
  gather_facts: true
  roles:
    - version-discovery
  vars:
    discovery_types:
      - rhos_release_file
      - nova

# Play 2: walk the in-use overcloud nodes one host at a time (serial: 1) and
# abort the whole run as soon as any host fails.
- name: Overcloud Reboot
  hosts: overcloud_nodes:!unused
  serial: 1
  any_errors_fatal: true
  gather_facts: false
  vars:
    # Default ceph client: first controller. A later task replaces it with
    # the first 'monitors' host when that group exists.
    ceph_client: "{{ groups['controller']|first }}"
    # podman when the release is above 14, docker otherwise.
    container_runtime: "{{ (install.version | default(undercloud_version) | openstack_release > 14) | ternary('podman', 'docker') }}"
  tasks:
      # Credentials file that the shell tasks below source before calling the
      # openstack/nova clients.
      - set_fact:
          overcloud_rc: "/home/stack/{{ install.overcloud.stack }}rc"
      # Evacuate compute node before reboot.
      # Every task in the block is delegated to the first undercloud host and
      # only applies to compute nodes, when more than one in-use compute
      # exists and install.postreboot_evacuate is truthy.
      - block:

          # Hosts of the cinder-volume services with the
          # "hostgroup@tripleo_" prefix stripped; used below to decide
          # whether block migration is required.
          - name: Register storage backend type
            shell: |
                source {{ overcloud_rc }}
                openstack volume service list -f json | jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host' | sed s/hostgroup@tripleo_//
            register: storage_backend

          # grep -w matches the hostname as a whole word, so compute-1 does
          # not also select compute-10 and yield a multi-line result.
          - name: Register hypervisor name fqdn
            shell: |
                source {{ overcloud_rc }}
                openstack hypervisor list -f value -c 'Hypervisor Hostname' | grep -w {{ inventory_hostname }}
            register: compute_fqdn

          # Names of ACTIVE/PAUSED servers on this hypervisor; the
          # migrate-back task at the end of the play reads this list.
          - name: Register instances running on compute node
            shell: |
                source {{ overcloud_rc }}
                openstack server list --host {{ compute_fqdn.stdout }} -f json | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .Name'
            register: compute_instances

          # --block-migrate is added unless the storage backend output
          # mentions ceph.
          - name: Quiesce compute node
            shell: |
                source {{ overcloud_rc }}
                nova host-evacuate-live {% if 'ceph' not in storage_backend.stdout %}--block-migrate{% endif %} {{ compute_fqdn.stdout }}
            when: compute_instances.stdout | length > 0

          # jq prints the number of ACTIVE/PAUSED/MIGRATING servers left on
          # the host. Compare the whole value with '0': a substring search
          # for "0" would also accept 10, 20, 101, ... and end the wait
          # while instances are still present.
          - name: Wait for compute node to get quiesced
            shell: |
                source {{ overcloud_rc }}
                openstack server list --host {{ compute_fqdn.stdout }} -f json | jq -r -c '[.[] | select(.Status | contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))] | length'
            register: compute_node_instances
            until: (compute_node_instances.stdout | trim) == '0'
            retries: 30
            delay: 5
            when: compute_instances.stdout | length > 0
        delegate_to: "{{ groups['undercloud']|first }}"
        when:
          - "'compute' in group_names"
          - groups.compute|difference(groups.unused|default([]))|length > 1
          - install.postreboot_evacuate|default(False)

      # Sort out ceph client: when a 'monitors' group exists, its first host
      # replaces the play-level ceph_client default.
      - name: Set ceph client to be the dedicated ceph monitor node if present
        when: "'monitors' in groups"
        set_fact:
          ceph_client: "{{ groups['monitors']|first }}"

      # Temporarily disable ceph rebalance on OSP < 14.
      # Sets the noout and norebalance flags with the plain ceph CLI on the
      # ceph client host; applies to hosts in the 'ceph' group only.
      - name: temporarily disable ceph rebalance
        become: true
        delegate_to: "{{ ceph_client }}"
        command: "{{ item }}"
        with_items:
          - 'ceph osd set noout'
          - 'ceph osd set norebalance'
        when: ('ceph' in group_names) and (overcloud_version|int < 14)

      # Temporarily disable ceph rebalance on OSP >= 14.
      # Same two flags as the OSP < 14 variant, but the ceph CLI is executed
      # inside the ceph-mon-<ceph_client> container via the container runtime.
      - name: temporarily disable ceph rebalance
        become: true
        delegate_to: "{{ ceph_client }}"
        command: "{{ item }}"
        with_items:
          - "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph osd set noout"
          - "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph osd set norebalance"
        when: ('ceph' in group_names) and (overcloud_version|int >= 14)

      # check pcs cluster status
      # The grep only succeeds when pcs reports that the cluster is NOT
      # running on this node, so a *failed* pcs_active result is what later
      # tasks treat as "pacemaker is active here". Errors are ignored so the
      # play continues either way.
      - name: check pacemaker cluster status
        become: true
        ignore_errors: true
        register: pcs_active
        shell: pcs status 2>&1 | grep 'cluster is not currently running on this node'

      # Runs only when pacemaker is active on the node (the "not running"
      # grep above failed). Succeeds when this host is listed on the
      # RemoteOnline: line of pcs status; a failure is ignored and is read by
      # later tasks as "not a pacemaker remote".
      # `is failed` replaces the `|failed` filter form, which was removed in
      # Ansible 2.9; the rest of the file already uses `is changed`.
      - name: check this if this is pcmk-remote node
        become: true
        shell: |
            pcs status 2>&1 | grep -w 'RemoteOnline:' | grep {{ inventory_hostname }}
        register: pcmk_remote
        ignore_errors: true
        when: pcs_active is failed

      # Stop pacemaker on this node before the reboot. Only for nodes where
      # pacemaker is active and which are not pacemaker remotes. --force is
      # appended when the controller group has fewer than three members.
      # Uses the `is failed` test; the `|failed` filter form was removed in
      # Ansible 2.9.
      - name: stop pcs cluster
        become: true
        command: "pcs cluster stop --request-timeout=300 {% if groups['controller']|length  < 3 %} --force {% endif %}"
        when:
            - pcs_active is failed
            - pcmk_remote is failed

      # Trigger the reboot without waiting for the command to finish
      # (async with poll: 0); errors are ignored. The 5 second sleep is
      # presumably there so the task can return before the connection drops.
      - name: reboot the node
        become: true
        ignore_errors: true
        async: 1
        poll: 0
        shell: "sleep 5 && shutdown -r now"

      # When a 'hypervisor' group exists: ping the node from the hypervisor
      # host until a ping fails (up to 60 tries, 3 seconds apart). Never
      # observing the outage is not treated as an error.
      - name: wait for node to go down(hypervisor)
        when: "'hypervisor' in groups"
        delegate_to: hypervisor
        command: ping -c1 {{ ansible_host|default(ansible_ssh_host) }}
        register: node_down
        until: node_down.rc != 0
        retries: 60
        delay: 3
        ignore_errors: true

      # When there is no 'hypervisor' group: ping the node from the first
      # undercloud host until a ping fails (up to 100 tries, 3 seconds
      # apart). Never observing the outage is not treated as an error.
      - name: wait for node to go down(undercloud)
        when: "'hypervisor' not in groups"
        delegate_to: "{{ groups['undercloud']|first }}"
        become: false
        command: ping -c1 {{ ansible_host|default(ansible_ssh_host) }}
        register: node_down
        until: node_down.rc != 0
        retries: 100
        delay: 3
        ignore_errors: true

      # When there is no 'hypervisor' group: from the first undercloud host,
      # wait for an OpenSSH banner on port 22 of the node. Each attempt waits
      # up to 30 seconds and the attempt is retried up to 21 times.
      # `is succeeded` replaces the `|succeeded` filter form, which was
      # removed in Ansible 2.9.
      - name: waiting for the node to be available(undercloud)
        become: no
        wait_for:
            port: 22
            host: "{{ ansible_host }}"
            search_regex: OpenSSH
            timeout: 30
        register: uc_reachable
        delegate_to: "{{ groups['undercloud']|first }}"
        when: "'hypervisor' not in groups"
        retries: 21
        until: uc_reachable is succeeded

      # When a 'hypervisor' group exists: from the hypervisor host, wait for
      # an OpenSSH banner on port 22 of the node. Starts checking after 30
      # seconds, polls every 15 seconds and gives up after 360 seconds.
      - name: waiting for the node to be available(hypervisor)
        when: "'hypervisor' in groups"
        delegate_to: hypervisor
        become: false
        wait_for:
          host: "{{ ansible_host }}"
          port: 22
          search_regex: OpenSSH
          delay: 30
          sleep: 15
          timeout: 360

      # Run `id` on the rebooted node until its output mentions the
      # connection user (20 tries, 15 seconds apart). The `|| echo` keeps the
      # command's exit status zero so only the `until` test decides success.
      # `until` is already a Jinja2 expression, so the variable is referenced
      # directly instead of being wrapped in nested {{ }} delimiters.
      - name: check node is ssh-able
        shell: id || echo "NOT_READY"
        register: id_ssh
        until: ansible_user in id_ssh.stdout
        retries: 20
        delay: 15

      # For pacemaker remote nodes only: poll pcs status until this host is
      # listed on the RemoteOnline: line (6 tries, 5 seconds apart).
      # Conditions use bare Jinja2 expressions and the `is failed` /
      # `is succeeded` tests; nested {{ }} in `until` and the filter forms
      # `|failed` / `|succeeded` are deprecated (filters removed in 2.9).
      - name: check pcmk_remote node is up
        become: true
        shell: pcs status | grep -w 'RemoteOnline:'
        register: remote_online
        until: inventory_hostname in remote_online.stdout
        retries: 6
        delay: 5
        when:
            - pcs_active is failed
            - pcmk_remote is succeeded

      # For full pacemaker cluster members (pacemaker active, not a remote):
      # poll pcs status until this host is listed on the Online: line
      # (60 tries, 5 seconds apart).
      # Conditions use bare Jinja2 expressions and the `is failed` test;
      # nested {{ }} in `until` and the `|failed` filter form are deprecated
      # (filter form removed in Ansible 2.9).
      - name: wait for pacemaker node to recover
        become: true
        shell: pcs status | grep -w 'Online:'
        register: pcs_status
        until: inventory_hostname in pcs_status.stdout
        retries: 60
        delay: 5
        when:
            - pcs_active is failed
            - pcmk_remote is failed

      # Optional step: restart and then clean up every pacemaker resource
      # named in cleanup_services. Runs only on full cluster members and only
      # when cleanup_services is defined and non-empty; errors are ignored.
      # Uses the `is failed` test; the `|failed` filter form was removed in
      # Ansible 2.9.
      - name: cleanup pacemaker resource
        shell: |
            pcs resource restart {{ item }} ;
            pcs resource cleanup {{ item }}
        become: true
        ignore_errors: true
        with_items:
          - "{{ cleanup_services }}"
        when:
          - cleanup_services is defined
          - cleanup_services|default({})|length > 0
          - pcs_active is failed
          - pcmk_remote is failed

      # Todo(yprokule): enhance search patterns
      # For each pattern, poll pcs status until the pattern no longer appears
      # in the output (36 tries, 10 seconds apart). `|| /bin/true` keeps the
      # shell exit status zero so only the `until` test decides the outcome.
      # Failures are ignored here and handled by the two tasks that follow.
      # Limited to full cluster members on deployments whose
      # install.deployment.files basename is 'virt'.
      # `until` / `when` are Jinja2 expressions already, so item and the
      # basename are referenced directly rather than through nested {{ }};
      # `is failed` replaces the `|failed` filter removed in Ansible 2.9.
      - name: check for any stopped pcs resources
        become: true
        shell: |
            pcs status | grep {{ item }} || /bin/true
        register: srv_status
        until: srv_status.stdout.find(item) == -1
        ignore_errors: true
        retries: 36
        delay: 10
        with_items:
            - "{{ (overcloud_version|int > 11) | ternary('Stopped', 'Stopped:') }}"
            - "Starting"
            - "Promoting"
        when:
            - pcs_active is failed
            - pcmk_remote is failed
            - (install.deployment.files | basename) == 'virt'

      # When the preceding stopped-resources check failed, run pcs status so
      # its result is registered; the next task fails the play whenever this
      # one actually ran (fail_stop is changed).
      # Uses the `is failed` test; the `|failed` filter form was removed in
      # Ansible 2.9.
      - name: Report pacemaker status for stopped resources
        become: true
        register: fail_stop
        command: pcs status
        when:
            - srv_status is failed

      # Abort when the report task above ran: a changed fail_stop means the
      # stopped-resources check did not pass.
      - name: Stopped resources found
        when: fail_stop is changed
        fail:
          msg: Some pacemeker resource failed to come back online after reboot


      # Todo(yprokule): enhance search patterns
      # Poll pcs status until the word 'unmanaged' no longer appears in the
      # output (36 tries, 10 seconds apart). `|| /bin/true` keeps the shell
      # exit status zero so only the `until` test decides the outcome.
      # Failures are ignored here and handled by the two tasks that follow.
      # Limited to full cluster members.
      # `until` references item directly instead of through nested {{ }};
      # `is failed` replaces the `|failed` filter removed in Ansible 2.9.
      - name: Check for any unmanaged pcs resources
        become: true
        shell: |
            pcs status | grep {{ item }} || /bin/true
        register: srv_status
        until: srv_status.stdout.find(item) == -1
        ignore_errors: true
        retries: 36
        delay: 10
        with_items:
            - 'unmanaged'
        when:
            - pcs_active is failed
            - pcmk_remote is failed

      # When the preceding unmanaged-resources check failed, run pcs status
      # so its result is registered; the next task fails the play whenever
      # this one actually ran (fail_unmanaged is changed).
      # Uses the `is failed` test; the `|failed` filter form was removed in
      # Ansible 2.9.
      - name: Report pacemaker status for unmanaged resources
        become: true
        register: fail_unmanaged
        command: pcs status
        when:
            - srv_status is failed

      # Abort when the report task above ran: a changed fail_unmanaged means
      # the unmanaged-resources check did not pass.
      - name: Unmanged resources found
        when: fail_unmanaged is changed
        fail:
          msg: Some pacemeker resource(s) unmanged after reboot

      # Re-enable ceph rebalance on OSP < 14.
      # Clears the noout and norebalance flags with the plain ceph CLI on the
      # ceph client host; applies to hosts in the 'ceph' group only.
      - name: reenable ceph rebalance
        become: true
        delegate_to: "{{ ceph_client }}"
        command: "{{ item }}"
        with_items:
          - 'ceph osd unset noout'
          - 'ceph osd unset norebalance'
        when: ('ceph' in group_names) and (overcloud_version|int < 14)

      # Re-enable ceph rebalance on OSP >= 14.
      # Same two flags as the OSP < 14 variant, cleared through the ceph CLI
      # inside the ceph-mon-<ceph_client> container.
      - name: reenable ceph rebalance
        become: true
        delegate_to: "{{ ceph_client }}"
        command: "{{ item }}"
        with_items:
          - "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph osd unset noout"
          - "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph osd unset norebalance"
        when: ('ceph' in group_names) and (overcloud_version|int >= 14)

      # OSP < 14, ceph hosts only: poll `ceph pg stat` on the ceph client
      # until the output contains "active+clean" (24 tries, 15 seconds
      # apart).
      - name: wait for OSDs to come back on OSP < 14
        become: true
        delegate_to: "{{ ceph_client }}"
        command: ceph pg stat
        register: active_osd
        until: "'active+clean' in active_osd.stdout"
        retries: 24
        delay: 15
        when: ('ceph' in group_names) and (overcloud_version|int < 14)

      # OSP >= 14, ceph hosts only: poll `ceph pg stat` inside the
      # ceph-mon-<ceph_client> container until the output contains
      # "active+clean" (24 tries, 15 seconds apart).
      - name: wait for OSDs to come back on OSP >= 14
        become: true
        delegate_to: "{{ ceph_client }}"
        command: "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph pg stat"
        register: active_osd
        until: "'active+clean' in active_osd.stdout"
        retries: 24
        delay: 15
        when: ('ceph' in group_names) and (overcloud_version|int >= 14)

      # Record whether `ceph status` reports monitor clock skew: stdout is
      # either the matching status line or the literal ALL_GOOD.
      # One task serves both deployment styles (on OSP >= 14 the command is
      # prefixed with "<container_runtime> exec ceph-mon-<ceph_client>").
      # Two version-specific tasks sharing `register: clock_skew` do not
      # work: a skipped task still registers its (stdout-less) result, so on
      # OSP < 14 the skipped >= 14 task would overwrite the real output.
      - name: check clock_skew on ceph cluster
        become: true
        shell: |
            {% if overcloud_version|int >= 14 %}{{ container_runtime }} exec ceph-mon-{{ ceph_client }} {% endif %}ceph status | grep 'Monitor clock skew detected' || echo 'ALL_GOOD'
        register: clock_skew
        when:
          - "'ceph' in group_names"
        delegate_to: "{{ ceph_client }}"

      # Strict health check, OSP < 14: with at least three in-use ceph nodes
      # and no clock skew reported, wait for HEALTH_OK (30 tries, 5 seconds
      # apart).
      # The skew condition inspects clock_skew.stdout. Testing
      # `'ALL_GOOD' in clock_skew` only looks at the keys of the registered
      # result dict, is therefore always false, and silently disabled this
      # check.
      - name: check ceph cluster status on OSP < 14
        become: true
        command: ceph status
        register: ceph_health
        until: ceph_health.stdout.find("HEALTH_OK") > -1
        retries: 30
        delay: 5
        delegate_to: "{{ ceph_client }}"
        when:
            - "'ceph' in group_names"
            - "groups['ceph']|difference(groups['unused']|default([]))|length >= 3"
            - overcloud_version|int < 14
            - "'ALL_GOOD' in (clock_skew.stdout | default(''))"

      # Strict health check, OSP >= 14: same conditions and retry policy as
      # the OSP < 14 variant, with `ceph status` executed inside the
      # ceph-mon-<ceph_client> container.
      - name: check ceph cluster status on OSP >= 14
        become: true
        command: "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph status"
        register: ceph_health
        until: ceph_health.stdout.find("HEALTH_OK") > -1
        retries: 30
        delay: 5
        delegate_to: "{{ ceph_client }}"
        when:
            - "'ceph' in group_names"
            - "groups['ceph']|difference(groups['unused']|default([]))|length >= 3"
            - overcloud_version|int >= 14
            - "'ALL_GOOD' in (clock_skew.stdout | default(''))"

      # Tolerant health check, OSP < 14: with at least three in-use ceph
      # nodes, poll `ceph status` on the ceph client until it reports either
      # HEALTH_WARN or HEALTH_OK (30 tries, 5 seconds apart).
      - name: check ceph cluster status with clock skew on OSP < 14
        become: true
        delegate_to: "{{ ceph_client }}"
        command: ceph status
        register: ceph_health
        until: "'HEALTH_WARN' in ceph_health.stdout or 'HEALTH_OK' in ceph_health.stdout"
        retries: 30
        delay: 5
        when:
          - "'ceph' in group_names"
          - "groups['ceph']|difference(groups['unused']|default([]))|length >= 3"
          - overcloud_version|int < 14

      # Tolerant health check, OSP >= 14: with at least three in-use ceph
      # nodes, poll `ceph status` inside the ceph-mon-<ceph_client> container
      # until it reports either HEALTH_WARN or HEALTH_OK (30 tries, 5 seconds
      # apart).
      - name: check ceph cluster status with clock skew on OSP >=14
        become: true
        delegate_to: "{{ ceph_client }}"
        command: "{{ container_runtime }} exec ceph-mon-{{ ceph_client }} ceph status"
        register: ceph_health
        until: "'HEALTH_WARN' in ceph_health.stdout or 'HEALTH_OK' in ceph_health.stdout"
        retries: 30
        delay: 5
        when:
          - "'ceph' in group_names"
          - "groups['ceph']|difference(groups['unused']|default([]))|length >= 3"
          - overcloud_version|int >= 14

      # Compute nodes only; every task is delegated to the first undercloud
      # host. After the reboot, wait for the nova-compute service and the
      # networking agents of this hypervisor to report as up. Each wait polls
      # up to 60 times, 5 seconds apart.
      - name: Assert nova-compute and Open vSwitch agent are up after reboot
        block:
          # grep -w matches the hostname as a whole word, so compute-1 does
          # not also select compute-10 and yield a multi-line result.
          - name: Register hypervisor name fqdn
            shell: |
                source {{ overcloud_rc }}
                openstack hypervisor list -f value -c 'Hypervisor Hostname' | grep -w {{ inventory_hostname }}
            register: compute_fqdn

          # The State column is lower-cased before looking for "up".
          - name: Waiting for nova-compute service to go up
            shell: |
                source {{ overcloud_rc }}
                openstack compute service list --host {{ compute_fqdn.stdout }} -f json | jq -r -c '.[] | select(.Binary | contains("nova-compute")) | .State' | tr A-Z a-z
            register: nova_compute_up
            until: nova_compute_up.stdout.find('up') > -1
            retries: 60
            delay: 5

          # Releases above 10: the Alive column is matched against ':-)'.
          # Below 15 the neutron-openvswitch-agent is checked; from 15 on the
          # ovn-controller and networking-ovn-metadata-agent are checked
          # instead.
          - block:
              - name: Waiting for OVS agent on compute node to come up for OSP > 10
                shell: |
                    source {{ overcloud_rc }}
                    openstack network agent list --host {{ compute_fqdn.stdout }} -f json | jq -r -c '.[] | select(.Binary | contains("neutron-openvswitch-agent")) | .Alive'
                register: nova_ovs_up
                until: nova_ovs_up.stdout.find(':-)') > -1
                retries: 60
                delay: 5
                when: overcloud_version|int < 15

              - name: Waiting for ovn-controller agent on compute node to come up
                shell: |
                    source {{ overcloud_rc }}
                    openstack network agent list --host {{ compute_fqdn.stdout }} -f json | jq -r -c '.[] | select(.Binary | contains("ovn-controller")) | .Alive'
                register: ovn_controller_up
                until: ovn_controller_up.stdout.find(':-)') > -1
                retries: 60
                delay: 5
                when: overcloud_version|int >= 15

              - name: Waiting for networking-ovn-metadata-agent agent on compute node to come up
                shell: |
                    source {{ overcloud_rc }}
                    openstack network agent list --host {{ compute_fqdn.stdout }} -f json | jq -r -c '.[] | select(.Binary | contains("networking-ovn-metadata-agent")) | .Alive'
                register: ovn_meta_up
                until: ovn_meta_up.stdout.find(':-)') > -1
                retries: 60
                delay: 5
                when: overcloud_version|int >= 15
            when: overcloud_version|int > 10

          # Release 10: agents are filtered by host inside jq and the Alive
          # column is matched against 'true'.
          - name: Waiting for OVS agent on compute node to come up for OSP == 10
            shell: |
                source {{ overcloud_rc }}
                openstack network agent list -f json | jq -r -c '.[] | select(.Binary | contains("neutron-openvswitch-agent")) | select(.Host | contains("{{ compute_fqdn.stdout }}")) | .Alive'
            register: nova_ovs_up
            until: nova_ovs_up.stdout.find('true') > -1
            retries: 60
            delay: 5
            when: overcloud_version|int == 10
        delegate_to: "{{ groups['undercloud']|first }}"
        when:
          - "'compute' in group_names"

      # Counterpart of the pre-reboot evacuation, guarded by the same
      # conditions plus a non-empty instance list. For every instance name
      # recorded before the reboot, request a live migration back to this
      # hypervisor and read the instance's OS-EXT-SRV-ATTR:host; repeat until
      # that value contains the hypervisor name (30 tries, 5 seconds apart).
      # --block-migrate is added unless the storage backend output mentions
      # ceph. Delegated to the first undercloud host.
      - name: migrate instance back to {{ compute_fqdn.stdout }}
        delegate_to: "{{ groups['undercloud']|first }}"
        shell: |
          source {{ overcloud_rc }}
          nova live-migration {% if 'ceph' not in storage_backend.stdout %}--block-migrate{% endif %} {{ item }} {{ compute_fqdn.stdout }}
          openstack server show {{ item }}  -f json | jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
        with_items: "{{ compute_instances.stdout_lines }}"
        register: instance_host
        until: compute_fqdn.stdout in instance_host.stdout
        retries: 30
        delay: 5
        when:
          - "'compute' in group_names"
          - groups.compute|difference(groups.unused|default([]))|length > 1
          - install.postreboot_evacuate|default(False)
          - compute_instances.stdout | length > 0