Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update action 515 #516

Merged
merged 9 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 1 addition & 19 deletions bibigrid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# -- BEGIN: GENERAL CLUSTER INFORMATION --
# sshTimeout: 5 # number of attempts to connect to instances during startup with delay in between
# cloudScheduling:
# sshTimeout: 42 # like sshTimeout but during the on demand scheduling on the running cluster
# sshTimeout: 5 # like sshTimeout but during the on demand scheduling on the running cluster

## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
#sshPublicKeyFiles:
Expand Down Expand Up @@ -72,24 +72,6 @@
# Depends on cloud image
sshUser: # for example ubuntu

# Depends on cloud site:
# Berlin : regionOne
# Bielefeld : bielefeld
# DKFZ : regionOne
# Giessen : RegionOne
# Heidelberg : RegionOne
# Tuebingen : RegionOne
region: Bielefeld

# Depends on cloud site:
# Berlin : nova
# Bielefeld : default
# DKFZ : nova
# Giessen : nova
# Heidelberg : nova
# Tuebingen : nova
availabilityZone: default

# Depends on cloud site and project
subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
# or network:
Expand Down
18 changes: 13 additions & 5 deletions bibigrid/core/actions/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,6 @@ def initialize_instances(self):
wait_for_services_commands = [
(wait_for_service_command.format(service=service), wait_for_service_message.format(service=service))
for service in configuration.get("waitForServices", [])]
print(wait_for_services_commands)
ssh_data["commands"] = (
wait_for_services_commands + self.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP)
ssh_data["filepaths"] = [(ssh_data["private_key"], ssh_handler.PRIVATE_KEY_FILE)]
Expand Down Expand Up @@ -340,12 +339,12 @@ def prepare_configurations(self):
configuration["subnet"]]
configuration["sshUser"] = self.ssh_user # is used in ansibleConfigurator

def upload_data(self):
def upload_data(self, private_key, clean_playbook=False):
"""
Configures ansible and then uploads the modified files and all necessary data to the master
@return:
"""
self.log.debug("Uploading ansible Data")
self.log.debug("Running upload_data")
if not os.path.isfile(a_rp.HOSTS_FILE):
with open(a_rp.HOSTS_FILE, 'a', encoding='utf-8') as hosts_file:
hosts_file.write("# placeholder file for worker DNS entries (see 003-dns)")
Expand All @@ -360,7 +359,14 @@ def upload_data(self):
self.log.debug(f"Starting playbook with {ansible_start}.")
commands = [ssh_handler.get_ac_command(self.providers, AC_NAME.format(
cluster_id=self.cluster_id))] + ssh_handler.ANSIBLE_START
ssh_data = {"floating_ip": self.master_ip, "private_key": KEY_FOLDER + self.key_name, "username": self.ssh_user,
if clean_playbook:
self.log.info("Cleaning Playbook")
ssh_data = {"floating_ip": self.master_ip, "private_key": private_key, "username": self.ssh_user,
"commands": [("rm -rf ~/playbook/*", "Remove Playbook")], "filepaths": [],
"gateway": self.configurations[0].get("gateway", {}), "timeout": self.ssh_timeout}
ssh_handler.execute_ssh(ssh_data=ssh_data, log=self.log)
self.log.info("Uploading Data")
ssh_data = {"floating_ip": self.master_ip, "private_key": private_key, "username": self.ssh_user,
"commands": commands, "filepaths": FILEPATHS, "gateway": self.configurations[0].get("gateway", {}),
"timeout": self.ssh_timeout}
ssh_handler.execute_ssh(ssh_data=ssh_data, log=self.log)
Expand All @@ -370,6 +376,7 @@ def start_start_server_threads(self):
Starts for each provider a start_instances thread and joins them.
@return:
"""
self.log.debug("Running start_start_server_threads")
start_server_threads = []
worker_count = 0
ansible_configurator.write_yaml(a_rp.HOSTS_FILE, {"host_entries": {}}, self.log)
Expand Down Expand Up @@ -397,6 +404,7 @@ def extended_network_configuration(self):
Configure master/vpn-worker network for a multi/hybrid cloud
@return:
"""
self.log.debug("Running extended_network_configuration")
if len(self.providers) == 1:
return

Expand Down Expand Up @@ -439,7 +447,7 @@ def create(self): # pylint: disable=too-many-branches,too-many-statements
self.start_start_server_threads()
self.extended_network_configuration()
self.initialize_instances()
self.upload_data()
self.upload_data(os.path.join(KEY_FOLDER, self.key_name))
self.log_cluster_start_info()
if self.configurations[0].get("deleteTmpKeypairAfter"):
for provider in self.providers:
Expand Down
10 changes: 5 additions & 5 deletions bibigrid/core/actions/list_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,12 @@ def get_master_access_ip(cluster_id, master_provider, log):
@param log:
@return: public ip of master
"""
# TODO: maybe move the method from list_clusters as it is now independent of list_clusters
log.info("Finding master ip for cluster %s...", cluster_id)
servers = master_provider.list_servers()
for server in servers:
master = create.MASTER_IDENTIFIER(cluster_id=cluster_id)
if server["name"].startswith(master):
return server.get("public_v4") or server.get("public_v6") or server.get("private_v4")
master = create.MASTER_IDENTIFIER(cluster_id=cluster_id)
server = master_provider.get_server(master)
if server:
return server.get("public_v4") or server.get("public_v6") or server.get("private_v4")
log.warning("Cluster %s not found on master_provider %s.", cluster_id,
master_provider.cloud_specification["identifier"])
return None
1 change: 0 additions & 1 deletion bibigrid/core/actions/terminate.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,6 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5
tmp_success = False
while not tmp_success:
try:
# TODO: Check if security group exists at all
not_found = not provider.get_security_group(security_group_name)
tmp_success = provider.delete_security_group(security_group_name)
except ConflictException:
Expand Down
40 changes: 25 additions & 15 deletions bibigrid/core/actions/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,34 @@
Module that contains methods to update the master playbook
"""

from bibigrid.core.utility import ansible_commands as a_c
from bibigrid.core.utility.handler import ssh_handler
from bibigrid.core.utility.paths import ansible_resources_path as a_rp
from bibigrid.core.utility.paths import bin_path
from bibigrid.core.actions import create
from bibigrid.core.actions.list_clusters import dict_clusters
from bibigrid.core.utility.handler import cluster_ssh_handler


def update(cluster_id, master_provider, master_configuration, log):
log.info("Starting update...")
master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider,
master_configuration, log)
def update(creator, log):
log.info(f"Starting update for cluster {creator.cluster_id}...")
master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(creator.cluster_id,
creator.providers[0],
creator.configurations[0], log)
log.info(f"Trying to update {master_ip}@{ssh_user} with key {used_private_key}")
cluster_dict = dict_clusters(creator.providers, log)
if cluster_dict[creator.cluster_id]["workers"]:
workers = [worker['name'] for worker in cluster_dict[creator.cluster_id]["workers"]]
log.warning(f"There are still workers up! {workers}")
return 1
if master_ip and ssh_user and used_private_key:
log.info("Trying to update %s@%s", master_ip, ssh_user)
ssh_handler.execute_ssh(floating_ip=master_ip, private_key=used_private_key, username=ssh_user,
log=log,
gateway=master_configuration.get("gateway", {}),
commands=[a_c.EXECUTE],
filepaths=[(a_rp.PLAYBOOK_PATH, a_rp.PLAYBOOK_PATH_REMOTE),
(bin_path.BIN_PATH, bin_path.BIN_PATH_REMOTE)])
master = create.MASTER_IDENTIFIER(cluster_id=creator.cluster_id)
server = creator.providers[0].get_server(master)
creator.master_ip = master_ip
creator.configurations[0]["private_v4"] = server["private_v4"]
creator.configurations[0]["floating_ip"] = master_ip
# TODO Test Volumes
creator.configurations[0]["volumes"] = server["volumes"]
creator.prepare_configurations()
log.log(42, f"Uploading data and executing BiBiGrid's Ansible playbook to {creator.cluster_id}")
creator.upload_data(used_private_key, clean_playbook=True)
log.log(42, f"Successfully updated cluster {creator.cluster_id}")
return 0
log.warning("One or more among master_ip, ssh_user and used_private_key are none. Aborting...")
return 1
15 changes: 11 additions & 4 deletions bibigrid/core/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ def list_servers(self):
"""

@abstractmethod
def create_server(self, name, flavor, image, network, key_name=None, wait=True,
volumes=None, security_groups=None): # pylint: disable=too-many-arguments
def create_server(self, name, flavor, image, network, key_name=None, wait=True, volumes=None,
security_groups=None): # pylint: disable=too-many-arguments
"""
Creates a new server and waits for it to be accessible if wait=True. If volumes are given, they are attached.
Returns said server (dict)
Expand Down Expand Up @@ -223,8 +223,8 @@ def get_active_images(self):
return [image["name"] for image in self.get_images() if image["status"].lower() == "active"]

def get_active_flavors(self):
return [flavor["name"] for flavor in self.get_flavors()
if "legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()]
return [flavor["name"] for flavor in self.get_flavors() if
"legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()]

@abstractmethod
def set_allowed_addresses(self, id_or_ip, allowed_address_pairs):
Expand Down Expand Up @@ -273,6 +273,13 @@ def get_security_group(self, name_or_id):
@return:
"""

# NOTE(review): no @abstractmethod decorator is visible on this method in this hunk - confirm whether that is intended.
def get_server(self, name_or_id):
"""
Looks up a single server by its name or id.
Returns server if found else None.
@param name_or_id: name or id of the server to look up
@return: the server if found, else None
""" # TODO Test

def get_mount_info_from_server(self, server):
volumes = []
for server_volume in server["volumes"]:
Expand Down
7 changes: 5 additions & 2 deletions bibigrid/core/startup.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def run_action(args, configurations, config_path):
creator = create.Create(providers=providers, configurations=configurations, log=LOG, debug=args.debug,
config_path=config_path)
LOG.log(42, "Creating a new cluster takes about 10 or more minutes depending on your cloud provider "
"and your configuration. Please be patient.")
"and your configuration. Please be patient.")
exit_state = creator.create()
else:
if not args.cluster_id:
Expand All @@ -99,7 +99,10 @@ def run_action(args, configurations, config_path):
exit_state = ide.ide(args.cluster_id, providers[0], configurations[0], LOG)
elif args.update:
LOG.info("Action update selected")
exit_state = update.update(args.cluster_id, providers[0], configurations[0], LOG)
creator = create.Create(providers=providers, configurations=configurations, log=LOG,
debug=args.debug,
config_path=config_path, cluster_id=args.cluster_id)
exit_state = update.update(creator, LOG)
for provider in providers:
provider.close()
else:
Expand Down
2 changes: 1 addition & 1 deletion bibigrid/core/utility/command_line_interpreter.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def interpret_command_line():
help="Establishes a secure connection to ide. Needs cluster-id set")
actions.add_argument("-u", "--update", action='store_true', help="Updates master's playbook. "
"Needs cluster-id set, no jobs running "
"and no workers up")
"and all workers down (experimental)")
args = parser.parse_args()
needs_config = args.terminate or args.create or args.list or args.check or args.ide
if needs_config and not args.config_input:
Expand Down
8 changes: 8 additions & 0 deletions bibigrid/openstack/openstack_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,11 @@ def get_security_group(self, name_or_id):
@return:
"""
return self.conn.get_security_group(name_or_id)

def get_server(self, name_or_id):
"""
Looks up a single server by delegating to the connection's get_server.
Returns server if found else None.
@param name_or_id: name or id of the server to look up
@return: whatever self.conn.get_server returns for name_or_id (server if found, else None)
"""
return self.conn.get_server(name_or_id)
26 changes: 23 additions & 3 deletions documentation/markdown/features/update.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
# Update
This feature is experimental

Updates ansible-playbook and nothing else. You cannot declare new instances or anything.
Only relevant if a fix or a new feature is added to the ansible-playbook.
In the future we will try to further enhance this feature.
Update re-uploads the playbook, updates the configuration data and executes the playbook again.

Updating the configuration data does not allow for all kinds of updates, because some changes
(like attaching volumes) would need an undo process, which is not implemented yet. Support for that might come in a future version.
Therefore, some keys listed below under [updatable](#updatable) are marked with "(activate)".
Those keys can only be activated by an update; they should not be deactivated.

**Configuration keys not listed below are considered not updatable.**

## Updatable
- Ansible playbook


- workerInstances
- useMasterAsCompute
- userRoles
- cloudScheduling
- waitForServices
- features
- ide (activate)
- nfsShares (activate)
- zabbix (activate)
1 change: 0 additions & 1 deletion resources/defaults/slurm/slurm.j2
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ SlurmdLogFile=/var/log/slurm/slurmd.log
{% endif %}
{% set _ = node_groups.append(node.name) %}
{% set mem = (node.flavor.ram // 1024) * 1000 %}
# {{ node }}
NodeName={{ node.name }} SocketsPerBoard={{ node.flavor.vcpus }} CoresPerSocket=1 RealMemory={{ mem - [mem // 2, 16000] | min }} State={{node.state }} {{"Features=" + (node.features | join(",")) if node.features is defined }}# {{ node.cloud_identifier }}
{% for partition in node.partitions %}
{% if partition not in partitions %}
Expand Down
18 changes: 10 additions & 8 deletions resources/playbook/roles/bibigrid/tasks/042-slurm-server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@
owner: slurm
group: root
mode: "0600"
notify:
- slurmdbd
- slurmctld

- name: Generate random JWT Secret
command:
Expand All @@ -42,8 +39,6 @@
owner: root
group: root
mode: "0644"
notify:
- slurmrestd

- name: Create system overrides directories (slurmdbdm slurmrestd)
file:
Expand All @@ -66,9 +61,6 @@
with_items:
- slurmdbd
- slurmrestd
notify:
- slurmdbd
- slurmrestd

- name: Register Slurm users home dir
shell: "set -o pipefail && grep slurm /etc/passwd | cut -d ':' -f 6"
Expand Down Expand Up @@ -220,3 +212,13 @@
- slurmd
- slurmdbd
- slurmrestd

- name: Restart Slurm services
systemd:
name: "{{ item }}"
state: restarted
loop:
- slurmdbd
- slurmrestd
- slurmctld
- slurmd
17 changes: 5 additions & 12 deletions resources/playbook/roles/bibigrid/tasks/042-slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,6 @@
with_items:
- slurmd
- slurmctld
notify:
- slurmd
- slurmctld

- name: Enable slurmctld and slurmd services
systemd:
Expand All @@ -93,9 +90,6 @@
owner: slurm
group: root
mode: 0444
notify:
- slurmctld
- slurmd

- name: Create Job Container configuration
template:
Expand All @@ -104,9 +98,6 @@
owner: slurm
group: root
mode: 0444
notify:
- slurmctld
- slurmd

- name: Slurm cgroup configuration
copy:
Expand All @@ -115,6 +106,8 @@
owner: slurm
group: root
mode: 0444
notify:
- slurmctld
- slurmd

- name: Restart slurmd
systemd:
name: slurmd
state: restarted
Loading
Loading