diff --git a/.gitignore b/.gitignore index e8c3cc34..f6c40ad0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ resources/playbook/group_vars/ # any log files *.log +log/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/bibigrid.sh b/bibigrid.sh index 80fb82ce..48d27d13 100755 --- a/bibigrid.sh +++ b/bibigrid.sh @@ -1 +1,2 @@ +#!/bin/bash python3 -m bibigrid.core.startup "$@" \ No newline at end of file diff --git a/bibigrid.yml b/bibigrid.yml index aa99d30d..876f9e61 100644 --- a/bibigrid.yml +++ b/bibigrid.yml @@ -1,4 +1,5 @@ # See https://cloud.denbi.de/wiki/Tutorials/BiBiGrid/ (after update) + # See https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md # First configuration also holds general cluster information and must include the master. # All other configurations mustn't include another master, but exactly one vpngtw instead (keys like master). @@ -8,9 +9,10 @@ # -- BEGIN: GENERAL CLUSTER INFORMATION -- ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself. #sshPublicKeyFiles: - # - [key one] + # - [public key one] ## Volumes and snapshots that will be mounted to master + # autoMount: False # WARNING: will overwrite unidentified filesystems #masterMounts: # KEY NOT FULLY IMPLEMENTED YET # - [mount one] @@ -36,13 +38,15 @@ ## Uncomment if you don't want assign a public ip to the master; for internal cluster (Tuebingen). #useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated + # deleteTmpKeypairAfter: False + # dontUploadCredentials: False # Other keys - default False #localFS: True #localDNSlookup: True #zabbix: True #nfs: True - #ide: True # Very useful to set on True. Use `./bibigrid.sh -i [path-to-bibigrid.yml] -ide -cid [cluster-id]` to start port forwarding to access the ide. + #ide: True # A nice way to view your cluster as if you were using Visual Studio Code useMasterAsCompute: True # Currently ignored by slurm @@ -52,15 +56,17 @@ # master configuration masterInstance: type: # existing type/flavor on your cloud. See launch instance>flavor for options - image: # existing image on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/images pick an active one. Currently only ubuntu22.04 is supported + image: # existing active image on your cloud. Consider using regex to prevent image updates from breaking your running cluster # features: # list # -- END: GENERAL CLUSTER INFORMATION -- + # fallbackOnOtherImage: False # if True, most similar image by name will be picked. A regex can also be given instead. + # worker configuration #workerInstances: # - type: # existing type/flavor on your cloud. See launch instance>flavor for options - # image: # same as master + # image: # same as master. Consider using regex to prevent image updates from breaking your running cluster # count: # any number of workers you would like to create with set type, image combination # # features: # list @@ -87,6 +93,10 @@ # Depends on cloud site and project subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/ + # or network: + # gateway: # if you want to use a gateway for create. + # ip: # IP of gateway to use + # portFunction: 30000 + oct4 # variables are called: oct1.oct2.oct3.oct4 # Uncomment if no full DNS service for started instances is available. # Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen. diff --git a/bibigrid/core/actions/check.py b/bibigrid/core/actions/check.py index cd4a15a2..0ad37c7b 100644 --- a/bibigrid/core/actions/check.py +++ b/bibigrid/core/actions/check.py @@ -1,21 +1,19 @@ """ Module that acts as a wrapper and uses validate_configuration to validate given configuration """ -import logging from bibigrid.core.utility import validate_configuration -LOG = logging.getLogger("bibigrid") - -def check(configurations, providers): +def check(configurations, providers, log): """ Uses validate_configuration to validate given configuration. :param configurations: list of configurations (dicts) :param providers: list of providers + :param log: :return: """ - success = validate_configuration.ValidateConfiguration(configurations, providers).validate() + success = validate_configuration.ValidateConfiguration(configurations, providers, log).validate() check_result = "succeeded! Cluster is ready to start." if success else "failed!" - print(f"Total check {check_result}") - LOG.info("Total check returned %s.", success) + log.log(42, f"Total check {check_result}") + log.info("Total check returned %s.", success) return 0 diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 71da041d..88c97889 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -2,7 +2,6 @@ The cluster creation (master's creation, key creation, ansible setup and execution, ...) is done here """ -import logging import os import subprocess import threading @@ -10,11 +9,13 @@ from functools import partial import paramiko +import sympy import yaml -from bibigrid.core.actions import terminate_cluster +from bibigrid.core.actions import terminate from bibigrid.core.utility import ansible_configurator from bibigrid.core.utility import id_generation +from bibigrid.core.utility import image_selection from bibigrid.core.utility.handler import ssh_handler from bibigrid.core.utility.paths import ansible_resources_path as aRP from bibigrid.core.utility.paths import bin_path as biRP @@ -25,19 +26,18 @@ PREFIX = "bibigrid" SEPARATOR = "-" PREFIX_WITH_SEP = PREFIX + SEPARATOR -LOG = logging.getLogger("bibigrid") +FILEPATHS = [(aRP.PLAYBOOK_PATH, aRP.PLAYBOOK_PATH_REMOTE), (biRP.BIN_PATH, biRP.BIN_PATH_REMOTE)] -def get_identifier(identifier, cluster_id, worker_group="", additional=""): +def get_identifier(identifier, cluster_id, additional=""): """ This method does more advanced string formatting to generate master, vpngtw and worker names @param identifier: master|vpngtw|worker @param cluster_id: id of cluster - @param worker_group: group of worker (every member of a group has same flavor/type and image) @param additional: an additional string to be added at the end @return: the generated string """ - general = PREFIX_WITH_SEP + identifier + str(worker_group) + SEPARATOR + cluster_id + general = PREFIX_WITH_SEP + identifier + SEPARATOR + cluster_id if additional or additional == 0: return general + SEPARATOR + str(additional) return general @@ -63,7 +63,7 @@ class Create: # pylint: disable=too-many-instance-attributes,too-many-arguments The class Create holds necessary methods to execute the Create-Action """ - def __init__(self, providers, configurations, config_path, debug=False): + def __init__(self, providers, configurations, config_path, log, debug=False, cluster_id=None): """ Additionally sets (unique) cluster_id, public_key_commands (to copy public keys to master) and key_name. Call create() to actually start server. @@ -73,16 +73,17 @@ def __init__(self, providers, configurations, config_path, debug=False): :param debug: Bool. If True Cluster offer shut-down after create and will ask before shutting down on errors """ + self.log = log self.providers = providers self.configurations = configurations self.debug = debug - self.cluster_id = id_generation.generate_safe_cluster_id(providers) + self.cluster_id = cluster_id or id_generation.generate_safe_cluster_id(providers) self.ssh_user = configurations[0].get("sshUser") or "ubuntu" self.ssh_add_public_key_commands = ssh_handler.get_add_ssh_public_key_commands( configurations[0].get("sshPublicKeyFiles")) self.config_path = config_path self.master_ip = None - LOG.debug("Cluster-ID: %s", self.cluster_id) + self.log.debug("Cluster-ID: %s", self.cluster_id) self.name = AC_NAME.format(cluster_id=self.cluster_id) self.key_name = KEY_NAME.format(cluster_id=self.cluster_id) self.default_security_group_name = DEFAULT_SECURITY_GROUP_NAME.format(cluster_id=self.cluster_id) @@ -91,8 +92,9 @@ def __init__(self, providers, configurations, config_path, debug=False): self.worker_counter = 0 self.vpn_counter = 0 self.thread_lock = threading.Lock() - self.use_master_with_public_ip = configurations[0].get("useMasterWithPublicIp", True) - LOG.debug("Keyname: %s", self.key_name) + self.use_master_with_public_ip = not configurations[0].get("gateway") and configurations[0].get( + "useMasterWithPublicIp", True) + self.log.debug("Keyname: %s", self.key_name) def generate_keypair(self): """ @@ -100,16 +102,18 @@ def generate_keypair(self): generate_keypair makes use of the fact that files in tmp are automatically deleted ToDo find a more pythonic way to create an ECDSA keypair See here for why using python module ECDSA wasn't successful - https://stackoverflow.com/questions/71194770/why-does-creating-ecdsa-keypairs-via-python-differ-from-ssh-keygen-t-ecdsa-and + https://stackoverflow.com/questions/71194770/why-does-creating-ecdsa-keypairs-via-python-differ-from-ssh + -keygen-t-ecdsa-and :return: """ + self.log.info("Generating keypair") # create KEY_FOLDER if it doesn't exist if not os.path.isdir(KEY_FOLDER): - LOG.info("%s not found. Creating folder.", KEY_FOLDER) + self.log.debug("%s not found. Creating folder.", KEY_FOLDER) os.mkdir(KEY_FOLDER) # generate keyfile res = subprocess.check_output(f'ssh-keygen -t ecdsa -f {KEY_FOLDER}{self.key_name} -P ""', shell=True).decode() - LOG.debug(res) + self.log.debug(res) # read private keyfile with open(f"{os.path.join(KEY_FOLDER, self.key_name)}.pub", mode="r", encoding="UTF-8") as key_file: public_key = key_file.read() @@ -127,6 +131,7 @@ def generate_security_groups(self): - default with basic rules for the cluster - wireguard when more than one provider is used (= multicloud) """ + self.log.info("Generating Security Groups") for provider, configuration in zip(self.providers, self.configurations): # create a default security group default_security_group_id = provider.create_security_group(name=self.default_security_group_name)["id"] @@ -170,10 +175,11 @@ def start_vpn_or_master_instance(self, configuration, provider): name = identifier(cluster_id=self.cluster_id, # pylint: disable=redundant-keyword-arg additional=self.vpn_counter) # pylint: disable=redundant-keyword-arg self.vpn_counter += 1 - LOG.info(f"Starting instance/server {name} on {provider.cloud_specification['identifier']}") + self.log.info(f"Starting instance/server {name} on {provider.cloud_specification['identifier']}") flavor = instance_type["type"] - image = instance_type["image"] network = configuration["network"] + image = image_selection.select_image(provider, instance_type["image"], self.log, + configuration.get("fallbackOnOtherImage")) # create a server and block until it is up and running server = provider.create_server(name=name, flavor=flavor, key_name=self.key_name, image=image, network=network, @@ -196,6 +202,7 @@ def start_vpn_or_master_instance(self, configuration, provider): provider.attach_available_floating_ip(network=external_network, server=server)["floating_ip_address"] elif identifier == MASTER_IDENTIFIER: configuration["floating_ip"] = server["private_v4"] # pylint: enable=comparison-with-callable + configuration["volumes"] = provider.get_mount_info_from_server(server) def prepare_vpn_or_master_args(self, configuration, provider): """ @@ -214,7 +221,7 @@ def prepare_vpn_or_master_args(self, configuration, provider): identifier = VPN_WORKER_IDENTIFIER volumes = [] # only master has volumes else: - LOG.warning("Configuration %s has no vpngtw or master and is therefore unreachable.", configuration) + self.log.warning("Configuration %s has no vpngtw or master and is therefore unreachable.", configuration) raise KeyError return identifier, instance_type, volumes @@ -227,11 +234,13 @@ def initialize_instances(self): self.master_ip = configuration["floating_ip"] ssh_handler.ansible_preparation(floating_ip=configuration["floating_ip"], private_key=KEY_FOLDER + self.key_name, username=self.ssh_user, - commands=self.ssh_add_public_key_commands) + commands=self.ssh_add_public_key_commands, log=self.log, + gateway=configuration.get("gateway", {})) elif configuration.get("vpnInstance"): ssh_handler.execute_ssh(floating_ip=configuration["floating_ip"], private_key=KEY_FOLDER + self.key_name, username=self.ssh_user, - commands=ssh_handler.VPN_SETUP) + commands=ssh_handler.VPN_SETUP, log=self.log, + gateway=configuration.get("gateway", {})) def prepare_volumes(self, provider, mounts): """ @@ -240,23 +249,24 @@ def prepare_volumes(self, provider, mounts): :param mounts: volumes or snapshots :return: list of pre-existing and newly created volumes """ - LOG.info("Preparing volumes") + if mounts: + self.log.info("Preparing volumes") volumes = [] for mount in mounts: volume_id = provider.get_volume_by_id_or_name(mount)["id"] if volume_id: volumes.append(volume_id) else: - LOG.debug("Volume %s does not exist. Checking for snapshot.", mount) + self.log.debug("Volume %s does not exist. Checking for snapshot.", mount) volume_id = provider.create_volume_from_snapshot(mount) if volume_id: volumes.append(volume_id) else: - LOG.warning("Mount %s is neither a snapshot nor a volume.", mount) + self.log.warning("Mount %s is neither a snapshot nor a volume.", mount) ret_volumes = set(volumes) if len(ret_volumes) < len(volumes): - LOG.warning("Identical mounts found in masterMounts list. " - "Trying to set() to save the run. Check configurations!") + self.log.warning("Identical mounts found in masterMounts list. " + "Trying to set() to save the run. Check configurations!") return ret_volumes def prepare_configurations(self): @@ -268,17 +278,19 @@ def prepare_configurations(self): for configuration, provider in zip(self.configurations, self.providers): configuration["cloud_identifier"] = provider.cloud_specification["identifier"] if not configuration.get("network"): + self.log.debug("No network found. Getting network by subnet.") configuration["network"] = provider.get_network_id_by_subnet(configuration["subnet"]) - if not configuration["network"]: - LOG.warning("Unable to set network. " - f"Subnet doesn't exist in cloud {configuration['cloud_identifier']}") + if not configuration.get("network"): + self.log.warning("Unable to set network. " + f"Subnet doesn't exist in cloud {configuration['cloud_identifier']}") raise ConfigurationException(f"Subnet doesn't exist in cloud {configuration['cloud_identifier']}") - elif not configuration.get("subnet"): - configuration["subnet"] = provider.get_subnet_ids_by_network(configuration["network"]) - if not configuration["subnet"]: - LOG.warning("Unable to set subnet. Network doesn't exist.") - raise ConfigurationException("Network doesn't exist.") - configuration["subnet_cidrs"] = provider.get_subnet_by_id_or_name(configuration["subnet"])["cidr"] + self.log.debug("Getting subnets by network.") + configuration["subnet"] = provider.get_subnet_ids_by_network(configuration["network"]) + if not configuration["subnet"]: + self.log.warning("Unable to set subnet. Network doesn't exist or has no subnets.") + raise ConfigurationException("Network doesn't exist.") + configuration["subnet_cidrs"] = [provider.get_subnet_by_id_or_name(subnet)["cidr"] for subnet in + configuration["subnet"]] configuration["sshUser"] = self.ssh_user # is used in ansibleConfigurator def upload_data(self): @@ -286,25 +298,25 @@ def upload_data(self): Configures ansible and then uploads the modified files and all necessary data to the master :return: """ + self.log.debug("Uploading ansible Data") for folder in [aRP.VARS_FOLDER, aRP.GROUP_VARS_FOLDER, aRP.HOST_VARS_FOLDER]: if not os.path.isdir(folder): - LOG.info("%s not found. Creating folder.", folder) + self.log.info("%s not found. Creating folder.", folder) os.mkdir(folder) if not os.path.isfile(aRP.HOSTS_FILE): with open(aRP.HOSTS_FILE, 'a', encoding='utf-8') as hosts_file: hosts_file.write("# placeholder file for worker DNS entries (see 003-dns)") ansible_configurator.configure_ansible_yaml(providers=self.providers, configurations=self.configurations, - cluster_id=self.cluster_id) + cluster_id=self.cluster_id, log=self.log) + if self.configurations[0].get("dontUploadCredentials"): + commands = ssh_handler.ANSIBLE_START + else: + commands = [ssh_handler.get_ac_command(self.providers, AC_NAME.format( + cluster_id=self.cluster_id))] + ssh_handler.ANSIBLE_START ssh_handler.execute_ssh(floating_ip=self.master_ip, private_key=KEY_FOLDER + self.key_name, - username=self.ssh_user, - filepaths=[(aRP.PLAYBOOK_PATH, aRP.PLAYBOOK_PATH_REMOTE), - (biRP.BIN_PATH, biRP.BIN_PATH_REMOTE)], - commands=[ - ssh_handler.get_ac_command( - self.providers, - AC_NAME.format( - cluster_id=self.cluster_id))] + ssh_handler.ANSIBLE_START) + username=self.ssh_user, filepaths=FILEPATHS, commands=commands, log=self.log, + gateway=self.configurations[0].get("gateway", {})) def start_start_instance_threads(self): """ @@ -335,8 +347,9 @@ def extended_network_configuration(self): for configuration_b in self.configurations: # ... and pick all other configuration if configuration_a != configuration_b: - LOG.info(f"{configuration_a['private_v4']} --> allowed_address_pair({configuration_a['mac_addr']}," - f"{configuration_b['subnet_cidrs']})") + self.log.info( + f"{configuration_a['private_v4']} --> allowed_address_pair({configuration_a['mac_addr']}," + f"{configuration_b['subnet_cidrs']})") # add provider_b network as allowed network allowed_addresses.append( {'ip_address': configuration_b["subnet_cidrs"], 'mac_address': configuration_a["mac_addr"]}) @@ -350,7 +363,7 @@ def extended_network_configuration(self): def create(self): # pylint: disable=too-many-branches,too-many-statements """ - Creates cluster and prints helpful cluster-info afterwards. + Creates cluster and logs helpful cluster-info afterwards. If debug is set True it offers termination after starting the cluster. :return: exit_state """ @@ -362,61 +375,75 @@ def create(self): # pylint: disable=too-many-branches,too-many-statements self.extended_network_configuration() self.initialize_instances() self.upload_data() - self.print_cluster_start_info() + self.log_cluster_start_info() + if self.configurations[0].get("deleteTmpKeypairAfter"): + for provider in self.providers: + terminate.delete_keypairs(provider=provider, tmp_keyname=self.key_name, log=self.log) + terminate.delete_local_keypairs(tmp_keyname=self.key_name, log=self.log) if self.debug: - LOG.info("DEBUG MODE: Entering termination...") - terminate_cluster.terminate_cluster(cluster_id=self.cluster_id, providers=self.providers, - debug=self.debug) + self.log.info("DEBUG MODE: Entering termination...") + terminate.terminate(cluster_id=self.cluster_id, providers=self.providers, debug=self.debug, + log=self.log) except exceptions.ConnectionException: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error("Connection couldn't be established. Check Provider connection.") + self.log.error(traceback.format_exc()) + self.log.error("Connection couldn't be established. Check Provider connection.") except paramiko.ssh_exception.NoValidConnectionsError: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error("SSH connection couldn't be established. Check keypair.") + self.log.error(traceback.format_exc()) + self.log.error("SSH connection couldn't be established. Check keypair.") except KeyError as exc: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error(f"Tried to access dictionary key {str(exc)}, but couldn't. Please check your configurations.") + self.log.error(traceback.format_exc()) + self.log.error( + f"Tried to access dictionary key {str(exc)}, but couldn't. Please check your configurations.") except FileNotFoundError as exc: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error(f"Tried to access resource files but couldn't. No such file or directory: {str(exc)}") + self.log.error(traceback.format_exc()) + self.log.error(f"Tried to access resource files but couldn't. No such file or directory: {str(exc)}") except TimeoutError as exc: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error(f"Timeout while connecting to master. Maybe you are trying to create a master without " - f"public ip " - f"while not being in the same network: {str(exc)}") + self.log.error(traceback.format_exc()) + self.log.error(f"Timeout while connecting to master. Maybe you are trying to create a master without " + f"public ip " + f"while not being in the same network: {str(exc)}") except ExecutionException as exc: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error(f"Execution of cmd on remote host fails: {str(exc)}") + self.log.error(traceback.format_exc()) + self.log.error(f"Execution of cmd on remote host fails: {str(exc)}") except ConfigurationException as exc: if self.debug: - LOG.error(traceback.format_exc()) - LOG.error(f"Configuration invalid: {str(exc)}") + self.log.error(traceback.format_exc()) + self.log.error(f"Configuration invalid: {str(exc)}") except Exception as exc: # pylint: disable=broad-except if self.debug: - LOG.error(traceback.format_exc()) - LOG.error(f"Unexpected error: '{str(exc)}' ({type(exc)}) Contact a developer!)") + self.log.error(traceback.format_exc()) + self.log.error(f"Unexpected error: '{str(exc)}' ({type(exc)}) Contact a developer!)") else: return 0 # will be called if no exception occurred - terminate_cluster.terminate_cluster(cluster_id=self.cluster_id, providers=self.providers, debug=self.debug) + terminate.terminate(cluster_id=self.cluster_id, providers=self.providers, log=self.log, debug=self.debug) return 1 - def print_cluster_start_info(self): + def log_cluster_start_info(self): """ - Prints helpful cluster-info: + Logs helpful cluster-info: SSH: How to connect to master via SSH Terminate: What bibigrid command is needed to terminate the created cluster - Detailed cluster info: How to print detailed info about the created cluster + Detailed cluster info: How to log detailed info about the created cluster :return: """ - print(f"Cluster {self.cluster_id} with master {self.master_ip} up and running!") - print(f"SSH: ssh -i '{KEY_FOLDER}{self.key_name}' {self.ssh_user}@{self.master_ip}") - print(f"Terminate cluster: ./bibigrid.sh -i '{self.config_path}' -t -cid {self.cluster_id}") - print(f"Detailed cluster info: ./bibigrid.sh -i '{self.config_path}' -l -cid {self.cluster_id}") + gateway = self.configurations[0].get("gateway") + ssh_ip = self.master_ip + port = None + if gateway: + octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(self.master_ip.split("."))} + port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets))) + ssh_ip = gateway["ip"] + self.log.log(42, f"Cluster {self.cluster_id} with master {self.master_ip} up and running!") + self.log.log(42, + f"SSH: ssh -i '{KEY_FOLDER}{self.key_name}' {self.ssh_user}@{ssh_ip}" + f"{f' -p {port}' if gateway else ''}") + self.log.log(42, f"Terminate cluster: ./bibigrid.sh -i '{self.config_path}' -t -cid {self.cluster_id}") + self.log.log(42, f"Detailed cluster info: ./bibigrid.sh -i '{self.config_path}' -l -cid {self.cluster_id}") if self.configurations[0].get("ide"): - print(f"IDE Port Forwarding: ./bibigrid.sh -i '{self.config_path}' -ide -cid {self.cluster_id}") + self.log.log(42, f"IDE Port Forwarding: ./bibigrid.sh -i '{self.config_path}' -ide -cid {self.cluster_id}") diff --git a/bibigrid/core/actions/ide.py b/bibigrid/core/actions/ide.py index 32c20aa9..840d970b 100644 --- a/bibigrid/core/actions/ide.py +++ b/bibigrid/core/actions/ide.py @@ -2,7 +2,6 @@ This module contains methods to establish port forwarding in order to access an ide (theia). """ -import logging import random import re import signal @@ -10,7 +9,9 @@ import sys import time import webbrowser + import sshtunnel +import sympy from bibigrid.core.utility.handler import cluster_ssh_handler @@ -20,7 +21,7 @@ LOCAL_BIND_ADDRESS = 9191 MAX_JUMP = 100 LOCALHOST = "127.0.0.1" -LOG = logging.getLogger("bibigrid") + def sigint_handler(caught_signal, frame): # pylint: disable=unused-argument @@ -49,37 +50,38 @@ def is_used(ip_address): for line in lines: is_open = re.match(rf'tcp.*{ip_address}:([0-9][0-9]*).*ESTABLISHED\s*$', line) if is_open is not None: - print(line) ports_used.append(is_open[1]) -def ide(cluster_id, master_provider, master_configuration): +def ide(cluster_id, master_provider, master_configuration, log): """ Creates a port forwarding from LOCAL_BIND_ADDRESS to REMOTE_BIND_ADDRESS from localhost to master of specified cluster @param cluster_id: cluster_id or ip @param master_provider: master's provider @param master_configuration: master's configuration + @param log: @return: """ - LOG.info("Starting port forwarding for ide") + log.info("Starting port forwarding for ide") master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider, - master_configuration) + master_configuration, log) used_local_bind_address = LOCAL_BIND_ADDRESS if master_ip and ssh_user and used_private_key: attempts = 0 + if master_configuration.get("gateway"): + octets = {f'oct{enum + 1}': int(elem) for enum, elem in enumerate(master_ip.split("."))} + port = sympy.sympify(master_configuration["gateway"]["portFunction"]).subs(dict(octets)) + gateway = (master_configuration["gateway"]["ip"], int(port)) + else: + gateway = None while attempts < 16: attempts += 1 try: - with sshtunnel.SSHTunnelForwarder( - ssh_address_or_host=master_ip, # Raspberry Pi in my network - - ssh_username=ssh_user, - ssh_pkey=used_private_key, - - local_bind_address=(LOCALHOST, used_local_bind_address), - remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS) - ) as server: + with sshtunnel.SSHTunnelForwarder(ssh_address_or_host=gateway or master_ip, ssh_username=ssh_user, + ssh_pkey=used_private_key, + local_bind_address=(LOCALHOST, used_local_bind_address), + remote_bind_address=(LOCALHOST, REMOTE_BIND_ADDRESS)) as server: print("CTRL+C to close port forwarding when you are done.") with server: # opens in existing window if any default program exists @@ -88,11 +90,11 @@ def ide(cluster_id, master_provider, master_configuration): time.sleep(5) except sshtunnel.HandlerSSHTunnelForwarderError: used_local_bind_address += random.randint(1, MAX_JUMP) - LOG.info("Attempt: %s. Port in use... Trying new port %s", attempts, used_local_bind_address) + log.info("Attempt: %s. Port in use... Trying new port %s", attempts, used_local_bind_address) if not master_ip: - LOG.warning("Cluster id %s doesn't match an existing cluster with a master.", cluster_id) + log.warning("Cluster id %s doesn't match an existing cluster with a master.", cluster_id) if not ssh_user: - LOG.warning("No ssh user has been specified in the first configuration.") + log.warning("No ssh user has been specified in the first configuration.") if not used_private_key: - LOG.warning("No matching sshPublicKeyFiles can be found in the first configuration or in .bibigrid") + log.warning("No matching sshPublicKeyFiles can be found in the first configuration or in .bibigrid") return 1 diff --git a/bibigrid/core/actions/list_clusters.py b/bibigrid/core/actions/list_clusters.py index 45e341a4..1965dc39 100644 --- a/bibigrid/core/actions/list_clusters.py +++ b/bibigrid/core/actions/list_clusters.py @@ -3,22 +3,22 @@ This includes a method to create a dictionary containing all running clusters and their servers. """ -import logging import pprint import re from bibigrid.core.actions import create SERVER_REGEX = re.compile(r"^bibigrid-((master)-([a-zA-Z0-9]+)|(worker|vpngtw)\d+-([a-zA-Z0-9]+)-\d+)$") -LOG = logging.getLogger("bibigrid") -def dict_clusters(providers): + +def dict_clusters(providers, log): """ Creates a dictionary containing all servers by type and provider information :param providers: list of all providers + :param log: :return: list of all clusters in yaml format """ - LOG.info("Creating cluster dictionary...") + log.info("Creating cluster dictionary...") cluster_dict = {} for provider in providers: servers = provider.list_servers() @@ -53,56 +53,59 @@ def setup(cluster_dict, cluster_id, server, provider): server["cloud_specification"] = provider.cloud_specification["identifier"] -def print_list_clusters(cluster_id, providers): +def log_list(cluster_id, providers, log): """ Calls dict_clusters and gives a visual representation of the found cluster. Detail depends on whether a cluster_id is given or not. :param cluster_id: :param providers: + :param log: :return: """ - cluster_dict = dict_clusters(providers=providers) - if cluster_id: # pylint: disable=too-many-nested-blocks + cluster_dict = dict_clusters(providers=providers, log=log) + if cluster_id: # pylint: disable=too-many-nested-blocks if cluster_dict.get(cluster_id): - LOG.info("Printing specific cluster_dictionary") - master_count, worker_count, vpn_count = get_size_overview(cluster_dict[cluster_id]) - print(f"\tCluster has {master_count} master, {vpn_count} vpngtw and {worker_count} regular workers. " - f"The cluster is spread over {vpn_count + master_count} reachable provider(s).") + log.info("Printing specific cluster_dictionary") + master_count, worker_count, vpn_count = get_size_overview(cluster_dict[cluster_id], log) + log.log(42, f"\tCluster has {master_count} master, {vpn_count} vpngtw and {worker_count} regular workers. " + f"The cluster is spread over {vpn_count + master_count} reachable provider(s).") pprint.pprint(cluster_dict[cluster_id]) else: - LOG.info("Cluster with cluster-id {cluster_id} not found.") - print(f"Cluster with cluster-id {cluster_id} not found.") + log.info("Cluster with cluster-id {cluster_id} not found.") + log.log(42, f"Cluster with cluster-id {cluster_id} not found.") else: - LOG.info("Printing overview of cluster all clusters") + log.info("Printing overview of cluster all clusters") if cluster_dict: for cluster_key_id, cluster_node_dict in cluster_dict.items(): - print(f"Cluster-ID: {cluster_key_id}") + log.log(42, f"Cluster-ID: {cluster_key_id}") master = cluster_node_dict.get('master') if master: for key in ["name", "user_id", "launched_at", "key_name", "public_v4", "public_v6", "provider"]: value = cluster_node_dict['master'].get(key) if value: - print(f"\t{key}: {value}") + log.log(42, f"\t{key}: {value}") security_groups = get_security_groups(cluster_node_dict) - print(f"\tsecurity_groups: {security_groups}") + log.log(42, f"\tsecurity_groups: {security_groups}") networks = get_networks(cluster_node_dict) - print(f"\tnetwork: {pprint.pformat(networks)}") + log.log(42, f"\tnetwork: {pprint.pformat(networks)}") else: - LOG.warning("No master for cluster: %s.", cluster_key_id) - master_count, worker_count, vpn_count = get_size_overview(cluster_node_dict) - print(f"\tCluster has {master_count} master, {vpn_count} vpngtw and {worker_count} regular workers. " - f"The cluster is spread over {vpn_count + master_count} reachable provider(s).") + log.warning("No master for cluster: %s.", cluster_key_id) + master_count, worker_count, vpn_count = get_size_overview(cluster_node_dict, log) + log.log(42, + f"\tCluster has {master_count} master, {vpn_count} vpngtw and {worker_count} regular workers. " + f"The cluster is spread over {vpn_count + master_count} reachable provider(s).") else: - print("No cluster found.") + log.log(42, "No cluster found.") return 0 -def get_size_overview(cluster_dict): +def get_size_overview(cluster_dict, log): """ :param cluster_dict: dictionary of cluster to size_overview + :param log: :return: number of masters, number of workers, number of vpns """ - LOG.info("Printing size overview") + log.info("Printing size overview") master_count = int(bool(cluster_dict.get("master"))) worker_count = len(cluster_dict.get("workers") or "") vpn_count = len(cluster_dict.get("vpngtws") or "") @@ -135,19 +138,20 @@ def get_security_groups(cluster_dict): return security_groups -def get_master_access_ip(cluster_id, master_provider): +def get_master_access_ip(cluster_id, master_provider, log): """ Returns master's ip of cluster cluster_id :param master_provider: master's provider :param cluster_id: Id of cluster + :param log: :return: public ip of master """ - LOG.info("Finding master ip for cluster %s...", cluster_id) + log.info("Finding master ip for cluster %s...", cluster_id) servers = master_provider.list_servers() for server in servers: master = create.MASTER_IDENTIFIER(cluster_id=cluster_id) if server["name"].startswith(master): return server.get("public_v4") or server.get("public_v6") or server.get("private_v4") - LOG.warning("Cluster %s not found on master_provider %s.", cluster_id, + log.warning("Cluster %s not found on master_provider %s.", cluster_id, master_provider.cloud_specification["identifier"]) return None diff --git a/bibigrid/core/actions/terminate_cluster.py b/bibigrid/core/actions/terminate.py similarity index 64% rename from bibigrid/core/actions/terminate_cluster.py rename to bibigrid/core/actions/terminate.py index a726738d..db812d7f 100644 --- a/bibigrid/core/actions/terminate_cluster.py +++ b/bibigrid/core/actions/terminate.py @@ -3,7 +3,6 @@ and application credentials used by it. """ -import logging import os import re import time @@ -11,19 +10,19 @@ from bibigrid.core.actions import create from bibigrid.models.exceptions import ConflictException -LOG = logging.getLogger("bibigrid") - -def terminate_cluster(cluster_id, providers, debug=False): +def terminate(cluster_id, providers, log, debug=False, assume_yes=False): """ Goes through all providers and gets info of all servers which name contains cluster ID. It then checks if any resources are reserved, but not used and frees them that were hold by the cluster. - :param debug if set user gets asked before termination is executed - :param providers providers - :param cluster_id: ID of cluster to terminate - :return: VOID - """ - if debug: + @param debug if set user gets asked before termination is executed + @param providers: + @param log: + @param cluster_id: ID of cluster to terminate + @param assume_yes: if set, no input will be asked, but instead yes will be assumed + @return VOID + """ + if not assume_yes and debug: if not input(f"DEBUG MODE: Any non-empty input to shutdown cluster {cluster_id}. " "Empty input to exit with cluster still alive:"): return 0 @@ -34,24 +33,25 @@ def terminate_cluster(cluster_id, providers, debug=False): cluster_keypair_state = [] cluster_security_group_state = [] tmp_keyname = create.KEY_NAME.format(cluster_id=cluster_id) - local_keypairs_deleted = delete_local_keypairs(tmp_keyname) - if local_keypairs_deleted or input(f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. " - f"This might not be your cluster. Are you sure you want to terminate it?\n" - f"Any non-empty input to shutdown cluster {cluster_id}. " - f"Empty input to exit with cluster still alive:"): + local_keypairs_deleted = delete_local_keypairs(tmp_keyname, log) + if not assume_yes and ( + local_keypairs_deleted or input(f"WARNING: No local temporary keyfiles found for cluster {cluster_id}. " + f"This might not be your cluster. Are you sure you want to terminate it?\n" + f"Any non-empty input to shutdown cluster {cluster_id}. " + f"Empty input to exit with cluster still alive:")): for provider in providers: - LOG.info("Terminating cluster %s on cloud %s", cluster_id, provider.cloud_specification['identifier']) + log.info("Terminating cluster %s on cloud %s", cluster_id, provider.cloud_specification['identifier']) server_list = provider.list_servers() - cluster_server_state += terminate_servers(server_list, cluster_id, provider) - cluster_keypair_state.append(delete_keypairs(provider, tmp_keyname)) - cluster_keypair_state.append(delete_security_groups(provider, cluster_id, security_groups)) - ac_state = delete_application_credentials(providers[0], cluster_id) + cluster_server_state += terminate_servers(server_list, cluster_id, provider, log) + cluster_keypair_state.append(delete_keypairs(provider, tmp_keyname, log)) + cluster_security_group_state.append(delete_security_groups(provider, cluster_id, security_groups, log)) + ac_state = delete_application_credentials(providers[0], cluster_id, log) terminate_output(cluster_server_state, cluster_keypair_state, cluster_security_group_state, ac_state, - cluster_id) + cluster_id, log) return 0 -def terminate_servers(server_list, cluster_id, provider): +def terminate_servers(server_list, cluster_id, provider, log): """ Terminates all servers in server_list that match the bibigrid regex. @param server_list: list of server dicts. All servers are from provider @@ -59,84 +59,87 @@ def terminate_servers(server_list, cluster_id, provider): @param provider: provider that holds all servers in server_list @return: a list of the servers' (that were to be terminated) termination states """ - LOG.info("Deleting servers on provider %s...", provider.cloud_specification['identifier']) + log.info("Deleting servers on provider %s...", provider.cloud_specification['identifier']) cluster_server_state = [] - # ^(master-{cluster_id}|worker-{cluster_id}|worker-[0-9]+-[0-9]+-{cluster_id})$ - server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}+|(worker\d+|vpngtw)-{cluster_id}+-\d+)$") + server_regex = re.compile(fr"^bibigrid-(master-{cluster_id}+|(worker|vpngtw)-{cluster_id}+-\d+)$") for server in server_list: if server_regex.match(server["name"]): - LOG.info("Trying to terminate Server %s on cloud %s.", server['name'], + log.info("Trying to terminate Server %s on cloud %s.", server['name'], provider.cloud_specification['identifier']) - cluster_server_state.append(terminate_server(provider, server)) + cluster_server_state.append(terminate_server(provider, server, log)) return cluster_server_state -def terminate_server(provider, server): +def terminate_server(provider, server, log): """ Terminates a single server and stores the termination state @param provider: the provider that holds the server @param server: the server that is to be terminated + @param log: @return: true if the server has been terminated, false else """ terminated = provider.delete_server(server["id"]) if not terminated: - LOG.warning("Unable to terminate server %s on provider %s.", server['name'], + log.warning("Unable to terminate server %s on provider %s.", server['name'], provider.cloud_specification['identifier']) else: - LOG.info("Server %s terminated on provider %s.", server['name'], provider.cloud_specification['identifier']) + log.info("Server %s terminated on provider %s.", server['name'], provider.cloud_specification['identifier']) return terminated -def delete_keypairs(provider, tmp_keyname): +def delete_keypairs(provider, tmp_keyname, log): """ Deletes keypairs from all provider @param provider: provider to delete keypair from @param tmp_keyname: BiBiGrid keyname + @param log @return: True if keypair was deleted """ - LOG.info("Deleting Keypair on provider %s...", provider.cloud_specification['identifier']) + log.info("Deleting Keypair on provider %s...", provider.cloud_specification['identifier']) deleted = provider.delete_keypair(tmp_keyname) if deleted: - LOG.info("Keypair %s deleted on provider %s.", tmp_keyname, provider.cloud_specification['identifier']) + log.info("Keypair %s deleted on provider %s.", tmp_keyname, provider.cloud_specification['identifier']) else: - LOG.warning("Unable to delete %s on provider %s.", tmp_keyname, provider.cloud_specification['identifier']) + log.warning("Unable to delete %s on provider %s.", tmp_keyname, provider.cloud_specification['identifier']) return deleted -def delete_local_keypairs(tmp_keyname): +def delete_local_keypairs(tmp_keyname, log): """ Deletes local keypairs of a cluster @param tmp_keyname: BiBiGrid keyname + @param log @return: Returns true if at least one local keyfile (pub or private) was found """ success = False - LOG.info("Deleting Keypair locally...") + log.info("Deleting Keypair locally...") tmp_keypath = os.path.join(create.KEY_FOLDER, tmp_keyname) pub_tmp_keypath = tmp_keypath + ".pub" if os.path.isfile(tmp_keypath): os.remove(tmp_keypath) success = True else: - LOG.warning(f"Unable to find private keyfile '{tmp_keypath}' locally. No local private keyfile deleted.") + log.warning(f"Unable to find private keyfile '{tmp_keypath}' locally. No local private keyfile deleted.") if os.path.isfile(pub_tmp_keypath): os.remove(pub_tmp_keypath) success = True else: - LOG.warning(f"Unable to find public keyfile '{pub_tmp_keypath}' locally. No local public keyfile deleted.") + log.warning(f"Unable to find public keyfile '{pub_tmp_keypath}' locally. No local public keyfile deleted.") return success -def delete_security_groups(provider, cluster_id, security_groups, timeout=5): +def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5): """ Delete configured security groups from provider. @param provider: current cloud provider @param cluster_id: cluster id @param timeout: how often should delete be attempted - @param has_wireguard: whether wireguard security group has been used + @param security_groups: security groups that have been used + @param log @return: True if all configured security groups can be deleted, false otherwise """ - LOG.info("Deleting security groups on provider %s...", provider.cloud_specification['identifier']) + log.info("Deleting security groups on provider %s...", provider.cloud_specification['identifier']) success = True for security_group_format in security_groups: security_group_name = security_group_format.format(cluster_id=cluster_id) @@ -147,38 +150,41 @@ def delete_security_groups(provider, cluster_id, security_groups, timeout=5): tmp_success = provider.delete_security_group(security_group_name) except ConflictException: tmp_success = False - if not tmp_success: - if attempts < timeout: - attempts += 1 - time.sleep(1+2 ** attempts) - LOG.info(f"Retrying to delete security group {security_group_name} on " - f"{provider.cloud_specification['identifier']}. Attempt {attempts}/{timeout}") - else: - LOG.error(f"Attempt to delete security group {security_group_name} on " - f"{provider.cloud_specification['identifier']} failed.") - break - LOG.info(f"Delete security_group {security_group_name} -> {tmp_success}") + if tmp_success: + break + if attempts < timeout: + attempts += 1 + time.sleep(1 + 2 ** attempts) + log.info(f"Retrying to delete security group {security_group_name} on " + f"{provider.cloud_specification['identifier']}. Attempt {attempts}/{timeout}") + else: + log.error(f"Attempt to delete security group {security_group_name} on " + f"{provider.cloud_specification['identifier']} failed.") + break + log.info(f"Delete security_group {security_group_name} -> {tmp_success}") success = success and tmp_success return success -def delete_application_credentials(master_provider, cluster_id): +def delete_application_credentials(master_provider, cluster_id, log): """ Deletes application credentials from the master_provider @param master_provider: provider that holds the master @param cluster_id: + @param log: @return: True if no cluster credential remains on the provider. Else False. """ # implement deletion auth = master_provider.cloud_specification["auth"] if not auth.get("application_credential_id") or not auth.get("application_credential_secret"): return master_provider.delete_application_credential_by_id_or_name(create.AC_NAME.format(cluster_id=cluster_id)) - LOG.info("Because you used application credentials to authenticate, " + log.info("Because you used application credentials to authenticate, " "no created application credentials need deletion.") return True -def terminate_output(cluster_server_state, cluster_keypair_state, cluster_security_group_state, ac_state, cluster_id): +def terminate_output(cluster_server_state, cluster_keypair_state, cluster_security_group_state, ac_state, cluster_id, + log): """ Logs the termination result in detail @param cluster_server_state: list of bools. Each bool stands for a server termination @@ -186,6 +192,7 @@ def terminate_output(cluster_server_state, cluster_keypair_state, cluster_securi @param cluster_security_group_state: list of bools. Each bool stands for a security group deletion @param ac_state: bool that stands for the deletion of the credentials on the master @param cluster_id: + @param log: @return: """ cluster_existed = bool(cluster_server_state) @@ -194,32 +201,30 @@ def terminate_output(cluster_server_state, cluster_keypair_state, cluster_securi cluster_security_group_deleted = all(cluster_security_group_state) if cluster_existed: if cluster_server_terminated: - LOG.info("Terminated all servers of cluster %s.", cluster_id) + log.info("Terminated all servers of cluster %s.", cluster_id) else: - LOG.warning("Unable to terminate all servers of cluster %s.", cluster_id) + log.warning("Unable to terminate all servers of cluster %s.", cluster_id) if cluster_keypair_deleted: - LOG.info("Deleted all keypairs of cluster %s.", cluster_id) + log.info("Deleted all keypairs of cluster %s.", cluster_id) else: - LOG.warning("Unable to delete all keypairs of cluster %s.", cluster_id) + log.warning("Unable to delete all keypairs of cluster %s.", cluster_id) if cluster_keypair_deleted: - LOG.info("Deleted all security groups of cluster %s.", cluster_id) + log.info("Deleted all security groups of cluster %s.", cluster_id) else: - LOG.warning("Unable to delete all security groups of cluster %s.", cluster_id) + log.warning("Unable to delete all security groups of cluster %s.", cluster_id) if cluster_server_terminated and cluster_keypair_deleted and cluster_security_group_deleted: - out = f"Successfully terminated cluster {cluster_id}." - LOG.info(out) - print(out) + log.log(42, f"Successfully terminated cluster {cluster_id}.") else: - LOG.warning("Unable to terminate cluster %s properly." + log.warning("Unable to terminate cluster %s properly." "\nAll servers terminated: %s" "\nAll keys deleted: %s" "\nAll security groups deleted: %s", cluster_id, cluster_server_terminated, cluster_keypair_deleted, cluster_security_group_deleted) if ac_state: - LOG.info("Successfully handled application credential of cluster %s.", cluster_id) + log.info("Successfully handled application credential of cluster %s.", cluster_id) else: - LOG.warning("Unable to delete application credential of cluster %s", cluster_id) + log.warning("Unable to delete application credential of cluster %s", cluster_id) else: - LOG.warning("Unable to find any servers for cluster-id %s. " + log.warning("Unable to find any servers for cluster-id %s. " "Check cluster-id and configuration.\nAll keys deleted: %s", cluster_id, cluster_keypair_deleted) diff --git a/bibigrid/core/actions/update.py b/bibigrid/core/actions/update.py index 90578de7..ed866a12 100644 --- a/bibigrid/core/actions/update.py +++ b/bibigrid/core/actions/update.py @@ -2,26 +2,24 @@ Module that contains methods to update the master playbook """ -import logging - from bibigrid.core.utility import ansible_commands as aC from bibigrid.core.utility.handler import ssh_handler from bibigrid.core.utility.paths import ansible_resources_path as aRP from bibigrid.core.utility.paths import bin_path as biRP from bibigrid.core.utility.handler import cluster_ssh_handler -LOG = logging.getLogger("bibigrid") -def update(cluster_id, master_provider, master_configuration): - LOG.info("Starting update...") +def update(cluster_id, master_provider, master_configuration, log): + log.info("Starting update...") master_ip, ssh_user, used_private_key = cluster_ssh_handler.get_ssh_connection_info(cluster_id, master_provider, - master_configuration) + master_configuration, log) if master_ip and ssh_user and used_private_key: - LOG.info("Trying to update %s@%s", master_ip, ssh_user) + log.info("Trying to update %s@%s", master_ip, ssh_user) ssh_handler.execute_ssh(floating_ip=master_ip, private_key=used_private_key, username=ssh_user, + log=log, + gateway=master_configuration.get("gateway", {}), commands=[aC.EXECUTE], filepaths=[(aRP.PLAYBOOK_PATH, aRP.PLAYBOOK_PATH_REMOTE), (biRP.BIN_PATH, biRP.BIN_PATH_REMOTE)]) return 0 - return 1 diff --git a/bibigrid/core/actions/version.py b/bibigrid/core/actions/version.py index 357f1dce..35631eff 100644 --- a/bibigrid/core/actions/version.py +++ b/bibigrid/core/actions/version.py @@ -3,4 +3,25 @@ https://www.akeeba.com/how-do-version-numbers-work.html """ -__version__ = "0.3.0" +import logging +import os + +import seedir + +from bibigrid.core.utility.handler import configuration_handler + +LOG = logging.getLogger("bibigrid") + +__version__ = "0.4.0" +RELEASE_DATE = "2023" +GIT_HUB = "https://github.com/BiBiServ/bibigrid" + + +def version(log): + log.log(42, f"BiBiGrid {__version__} ({RELEASE_DATE})\nBielefeld University\n{GIT_HUB}\n\n" + "# Configuration Folders\n") + for directory in configuration_handler.CLOUDS_YAML_PATHS: + if os.path.isdir(os.path.expanduser(directory)): + log.log(42, f"## '{directory}'\n") + dir_print = seedir.seedir(directory, exclude_folders=["keys"], printout=False) + log.log(42, dir_print) diff --git a/bibigrid/core/provider.py b/bibigrid/core/provider.py index 04af597e..160a32e1 100644 --- a/bibigrid/core/provider.py +++ b/bibigrid/core/provider.py @@ -1,9 +1,10 @@ """ Holds the abstract class Provider """ +from abc import ABC, abstractmethod -class Provider: # pylint: disable=too-many-public-methods +class Provider(ABC): # pylint: disable=too-many-public-methods """ See in detailed return value information in tests>provider>test_Provider. Make sure to register your newly implemented provider in provider_handler: name:class @@ -23,6 +24,7 @@ def __init__(self, cloud_specification): self.cloud_specification = cloud_specification # contains sensitive information! self.cloud_specification["identifier"] = self.cloud_specification['identifier'] + @abstractmethod def create_application_credential(self, name=None): """ Creates an application credential with name name @@ -30,6 +32,7 @@ def create_application_credential(self, name=None): :return: the application credential dictionary """ + @abstractmethod def delete_application_credential_by_id_or_name(self, ac_id_or_name): """ Deletes existing application credential by id or name and returns true. @@ -38,6 +41,7 @@ def delete_application_credential_by_id_or_name(self, ac_id_or_name): :return: True if deleted else false """ + @abstractmethod def get_image_by_id_or_name(self, image_id_or_name): """ Returns image that has id or name image_id_or_name @@ -45,6 +49,7 @@ def get_image_by_id_or_name(self, image_id_or_name): :return: said image (dict) or none if not found """ + @abstractmethod def get_flavor(self, instance_type): """ Returns flavor that has id or name flavor_id_or_name @@ -52,6 +57,7 @@ def get_flavor(self, instance_type): :return: said flavor (dict) or none if not found """ + @abstractmethod def get_volume_snapshot_by_id_or_name(self, snapshot_id_or_name): """ Returns snapshot that has id or name snapshot_id_or_name @@ -59,6 +65,7 @@ def get_volume_snapshot_by_id_or_name(self, snapshot_id_or_name): :return: said snapshot (dict) or none if not found """ + @abstractmethod def get_network_by_id_or_name(self, network_id_or_name): """ Returns network that has id or name network_id_or_name @@ -66,6 +73,7 @@ def get_network_by_id_or_name(self, network_id_or_name): :return: said network (dict) or none if not found """ + @abstractmethod def get_subnet_by_id_or_name(self, subnet_id_or_name): """ Returns subnet that has id or name subnet_id_or_name @@ -73,12 +81,14 @@ def get_subnet_by_id_or_name(self, subnet_id_or_name): :return: said subnet (dict) or none if not found """ + @abstractmethod def list_servers(self): """ Returns a list of all servers on logged in provider :return: said list of servers or empty list if none found """ + @abstractmethod def create_server(self, name, flavor, image, network, key_name=None, wait=True, volumes=None, security_groups=None): # pylint: disable=too-many-arguments """ @@ -95,6 +105,7 @@ def create_server(self, name, flavor, image, network, key_name=None, wait=True, :return: server (dict) """ + @abstractmethod def delete_server(self, name_or_id, delete_ips=True): """ Deletes server and floating_ip as well if delete_ips is true. The resource is then free again @@ -103,6 +114,7 @@ def delete_server(self, name_or_id, delete_ips=True): :return: True if delete succeeded, False otherwise """ + @abstractmethod def delete_keypair(self, key_name): """ Deletes keypair with key_name @@ -110,6 +122,7 @@ def delete_keypair(self, key_name): :return: True if delete succeeded, False otherwise """ + @abstractmethod def get_server_group_by_id_or_name(self, server_group_id_or_name): """ Returns server_group that has id or name server_group_id_or_name @@ -117,12 +130,14 @@ def get_server_group_by_id_or_name(self, server_group_id_or_name): :return: said server_group (dict) or none if not found """ + @abstractmethod def close(self): """ Closes connection :return: """ + @abstractmethod def create_keypair(self, name, public_key): """ Creates a new keypair with name name and public_key public_key @@ -131,6 +146,7 @@ def create_keypair(self, name, public_key): :return: """ + @abstractmethod def get_network_id_by_subnet(self, subnet): """ Gets network_id by subnet @@ -138,6 +154,7 @@ def get_network_id_by_subnet(self, subnet): :return: (str) """ + @abstractmethod def get_subnet_ids_by_network(self, network): """ Gets subnet_ids (list (str)) by network_id @@ -145,12 +162,14 @@ def get_subnet_ids_by_network(self, network): :return: subnet_ids (list (str)) """ + @abstractmethod def get_free_resources(self): """ Gets free resources. If a resource cannot be determined, assume maximum is free. :return: Dictionary containing the free resources """ + @abstractmethod def get_volume_by_id_or_name(self, name_or_id): """ Returns volume that has id or name name_or_id @@ -158,6 +177,7 @@ def get_volume_by_id_or_name(self, name_or_id): :return: said volume (dict) or none if not found """ + @abstractmethod def create_volume_from_snapshot(self, snapshot_name_or_id): """ Creates a volume from snapshot. @@ -165,6 +185,7 @@ def create_volume_from_snapshot(self, snapshot_name_or_id): :return: id of created volume or none if failed """ + @abstractmethod def get_external_network(self, network_name_or_id): """ Finds router interface with network id equal to given network and by that the external network. @@ -172,17 +193,7 @@ def get_external_network(self, network_name_or_id): :return: Corresponding external network """ - def add_auto_ip(self, server, wait=False, timeout=60, reuse=True): - """ - Add a floating IP to a server. - Will reuse floating ips or create a new one if no floating-ip is down. - :param server: the server that said floating ip will be attached to - :param wait: wait for floating-ip to be assigned - :param timeout: when to accept failing - :param reuse: if False will just create a new floating-ip and not reuse an existing down one - :return: the floating-ip - """ - + @abstractmethod def attach_available_floating_ip(self, network=None, server=None): """ Get a floating IP from a network or a pool and attach it to the server @@ -191,12 +202,14 @@ def attach_available_floating_ip(self, network=None, server=None): :return: """ + @abstractmethod def get_images(self): """ Get a generator able ot generate all images @return: A generator able ot generate all images """ + @abstractmethod def get_flavors(self): """ Get a generator able ot generate all flavors @@ -214,6 +227,7 @@ def get_active_flavors(self): return [flavor["name"] for flavor in self.get_flavors() if "legacy" not in flavor["name"].lower() and "deprecated" not in flavor["name"].lower()] + @abstractmethod def set_allowed_addresses(self, id_or_ip, allowed_address_pairs): """ Set allowed address (or CIDR) for the given network interface/port @@ -226,6 +240,7 @@ def set_allowed_addresses(self, id_or_ip, allowed_address_pairs): :return: """ + @abstractmethod def create_security_group(self, name, rules): """ Create a security group and add given rules @@ -234,14 +249,15 @@ def create_security_group(self, name, rules): :return: id of created security group """ + @abstractmethod def delete_security_group(self, name_or_id): """ Delete a security group :param name_or_id : Name or Id of the security group to be deleted :return: True if delete succeeded, False otherwise. - """ + @abstractmethod def append_rules_to_security_group(self, name_or_id, rules): """ Append firewall rules to given security group @@ -249,3 +265,13 @@ def append_rules_to_security_group(self, name_or_id, rules): :param rules: :return: """ + + def get_mount_info_from_server(self, server): + volumes = [] + for server_volume in server["volumes"]: + volume = self.get_volume_by_id_or_name(server_volume["id"]) + for attachment in volume["attachments"]: + if attachment["server_id"] == server["id"]: + volumes.append({"name": volume["name"], "device": attachment["device"]}) + break + return volumes diff --git a/bibigrid/core/startup.py b/bibigrid/core/startup.py index 2e2e208a..3a073f27 100755 --- a/bibigrid/core/startup.py +++ b/bibigrid/core/startup.py @@ -10,15 +10,14 @@ import yaml -from bibigrid.core.actions import check, create, ide, list_clusters, terminate_cluster, update, version +from bibigrid.core.actions import check, create, ide, list_clusters, terminate, update, version from bibigrid.core.utility import command_line_interpreter from bibigrid.core.utility.handler import configuration_handler, provider_handler -LOGGING_HANDLER_LIST = [logging.StreamHandler(), logging.FileHandler("bibigrid.log")] # stdout and to file VERBOSITY_LIST = [logging.WARNING, logging.INFO, logging.DEBUG] LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s" - LOG = logging.getLogger("bibigrid") +logging.addLevelName(42, "PRINT") def get_cluster_id_from_mem(): @@ -48,14 +47,13 @@ def set_logger_verbosity(verbosity): capped_verbosity = min(verbosity, len(VERBOSITY_LIST) - 1) # LOG.basicConfig(format=LOGGER_FORMAT, level=VERBOSITY_LIST[capped_verbosity], # handlers=LOGGING_HANDLER_LIST) + LOG.setLevel(VERBOSITY_LIST[capped_verbosity]) - log = logging.getLogger("bibigrid") - log.setLevel(VERBOSITY_LIST[capped_verbosity]) - - log.debug(f"Logging verbosity set to {capped_verbosity}") + LOG.debug(f"Logging verbosity set to {capped_verbosity}") -def run_action(args, configurations, config_path): # pylint: disable=too-many-nested-blocks,too-many-branches +# pylint: disable=too-many-nested-blocks,too-many-branches, too-many-statements +def run_action(args, configurations, config_path): """ Uses args to decide which action will be executed and executes said action. :param args: command line arguments @@ -65,60 +63,61 @@ def run_action(args, configurations, config_path): # pylint: disable=too-many-n """ if args.version: LOG.info("Action version selected") - print(version.__version__) + version.version(LOG) return 0 start_time = time.time() exit_state = 0 try: - providers = provider_handler.get_providers(configurations) + providers = provider_handler.get_providers(configurations, LOG) if providers: - if args.list_clusters: - LOG.info("Action list_clusters selected") - exit_state = list_clusters.print_list_clusters(args.cluster_id, providers) + if args.list: + LOG.info("Action list selected") + exit_state = list_clusters.log_list(args.cluster_id, providers, LOG) elif args.check: LOG.info("Action check selected") - exit_state = check.check(configurations, providers) + exit_state = check.check(configurations, providers, LOG) elif args.create: LOG.info("Action create selected") creator = create.Create(providers=providers, configurations=configurations, + log=LOG, debug=args.debug, config_path=config_path) - print("Creating a new cluster takes about 10 or more minutes depending on your cloud provider " - "and your configuration. Be patient.") + LOG.log(42, "Creating a new cluster takes about 10 or more minutes depending on your cloud provider " + "and your configuration. Be patient.") exit_state = creator.create() else: if not args.cluster_id: args.cluster_id = get_cluster_id_from_mem() LOG.info("No cid (cluster_id) specified. Defaulting to last created cluster: %s", - args.cluster_id or 'None found') + args.cluster_id or 'None found') if args.cluster_id: - if args.terminate_cluster: - LOG.info("Action terminate_cluster selected") - exit_state = terminate_cluster.terminate_cluster(cluster_id=args.cluster_id, - providers=providers, - debug=args.debug) + if args.terminate: + LOG.info("Action terminate selected") + exit_state = terminate.terminate(cluster_id=args.cluster_id, + providers=providers, + log=LOG, + debug=args.debug) elif args.ide: LOG.info("Action ide selected") - exit_state = ide.ide(args.cluster_id, providers[0], configurations[0]) + exit_state = ide.ide(args.cluster_id, providers[0], configurations[0], LOG) elif args.update: LOG.info("Action update selected") exit_state = update.update(args.cluster_id, providers[0], configurations[0]) - else: - LOG.warning("Please make use of -cid .") for provider in providers: provider.close() else: exit_state = 1 except Exception as err: # pylint: disable=broad-except if args.debug: - traceback.print_exc() + exc_type, exc_value, exc_traceback = sys.exc_info() + LOG.error("".join(traceback.format_exception(exc_type, exc_value, exc_traceback))) else: LOG.error(err) exit_state = 2 time_in_s = time.time() - start_time - print(f"--- {math.floor(time_in_s / 60)} minutes and {round(time_in_s % 60, 2)} seconds ---") + LOG.log(42, f"--- {math.floor(time_in_s / 60)} minutes and {round(time_in_s % 60, 2)} seconds ---") return exit_state @@ -127,10 +126,12 @@ def main(): Interprets command line, sets logger, reads configuration and runs selected action. Then exits. :return: """ - logging.basicConfig(format=LOGGER_FORMAT, handlers=LOGGING_HANDLER_LIST) + logging.basicConfig(format=LOGGER_FORMAT) + # LOG.addHandler(logging.StreamHandler()) # stdout + LOG.addHandler(logging.FileHandler("bibigrid.log")) # file args = command_line_interpreter.interpret_command_line() set_logger_verbosity(args.verbose) - configurations = configuration_handler.read_configuration(args.config_input) + configurations = configuration_handler.read_configuration(LOG, args.config_input) if configurations: sys.exit(run_action(args, configurations, args.config_input)) sys.exit(1) diff --git a/bibigrid/core/startup_rest.py b/bibigrid/core/startup_rest.py new file mode 100644 index 00000000..ad737bc4 --- /dev/null +++ b/bibigrid/core/startup_rest.py @@ -0,0 +1,382 @@ +""" +Contains main method. Interprets command line, sets logging and starts corresponding action. +""" +import asyncio +import logging +import multiprocessing +import os +import subprocess + +import uvicorn +import yaml +from fastapi import FastAPI, File, UploadFile, status, Request +from fastapi.exceptions import RequestValidationError +from fastapi.responses import JSONResponse +from fastapi.testclient import TestClient +from pydantic import BaseModel + +from bibigrid.core.actions import check, create, terminate, list_clusters +from bibigrid.core.utility import id_generation +from bibigrid.core.utility.handler import provider_handler + +VERSION = "0.0.1" +DESCRIPTION = """ +BiBiGrid REST API allows you to use the most important features of [BiBiGrid](https://github.com/BiBiServ/bibigrid) +via REST. This includes: +validation, creation, termination and getting cluster information. +""" + +app = FastAPI(title="BiBiGrid REST API", description=DESCRIPTION, + summary="REST API for the cluster creation and management tool BiBiGrid.", version=VERSION) + +LOG_FOLDER = "log" +if not os.path.isdir(LOG_FOLDER): + os.mkdir(LOG_FOLDER) + +LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s" +LOG_FORMATTER = logging.Formatter(LOG_FORMAT) +LOG = logging.getLogger("bibigrid") +file_handler = logging.FileHandler(os.path.join(LOG_FOLDER, "bibigrid_rest.log")) +file_handler.setFormatter(LOG_FORMATTER) +LOG.addHandler(file_handler) +logging.addLevelName(42, "PRINT") +LOG.setLevel(logging.DEBUG) + + +# pylint: disable=too-few-public-methods +class ValidationResponseModel(BaseModel): + """ + ResponseModel for validate + """ + message: str + cluster_id: str + success: bool + + +# pylint: disable=too-few-public-methods +class CreateResponseModel(BaseModel): + """ + ResponseModel for create + """ + message: str + cluster_id: str + + +# pylint: disable=too-few-public-methods +class TerminateResponseModel(BaseModel): + """ + ResponseModel for terminate + """ + message: str + + +# pylint: disable=too-few-public-methods +class InfoResponseModel(BaseModel): + """ + ResponseModel for info + """ + workers: list + vpngtws: list + master: dict + message: str + ready: bool + + +# pylint: disable=too-few-public-methods +class LogResponseModel(BaseModel): + """ + ResponseModel for get_log + """ + message: str + log: str + + +# pylint: disable=too-few-public-methods +class ReadyResponseModel(BaseModel): + """ + ResponseModel for ready + """ + message: str + ready: bool + + +def tail(file_path, lines): + return subprocess.check_output(['tail', '-n', str(lines), file_path], universal_newlines=True) + + +def setup(cluster_id): + """ + If cluster_id is none, generates a cluster id and sets up the logger. Logger has name cluster_id and + logs to file named cluster_id .log. Returns both. + @param cluster_id: cluster_id or None + @return: tuple of cluster_id and logger + """ + cluster_id = cluster_id or id_generation.generate_cluster_id() + log = logging.getLogger(cluster_id) + log.setLevel(logging.DEBUG) + if not log.handlers: + log_handler = logging.FileHandler(os.path.join(LOG_FOLDER, f"{cluster_id}.log")) + log_handler.setFormatter(LOG_FORMATTER) + log.addHandler(log_handler) + return cluster_id, log + + +def is_up(cluster_id, log): + """ + Checks if cluster with cluster_id is up and running + @param cluster_id: + @param log: + @return: + """ + file_name = os.path.join(LOG_FOLDER, f"{cluster_id}.log") + if os.path.isfile(file_name): + log.debug(f"Log for {cluster_id} found.") + with open(file_name, "r", encoding="utf8") as log_file: + for line in reversed(log_file.readlines()): + if "up and running" in line: + log.debug(f"Found running cluster for {cluster_id}.") + return True + if "Successfully terminated cluster" in line: + log.debug(f"Found cluster termination for {cluster_id}.") + return False + else: + log.debug(f"Log for {cluster_id} not found.") + log.debug(f"Found neither a running nor a terminated cluster for {cluster_id}.") + return False + + +@app.exception_handler(RequestValidationError) +async def validation_exception_handler(request: Request, exc: RequestValidationError): + exc_str = f'{exc}'.replace('\n', ' ').replace(' ', ' ') + logging.error(f"{request}: {exc_str}") + content = {'status_code': 10422, 'message': exc_str, 'data': None} + return JSONResponse(content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY) + + +@app.post("/bibigrid/validate", response_model=ValidationResponseModel) +async def validate_configuration(cluster_id: str = None, config_file: UploadFile = File(...)): + """ + Expects a cluster id and a + [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md) + file. + + Returns validation result (success, or failure) + * @param cluster_id: optional id of to be created cluster in order to log into the same file. + If not given, one is generated. + * @param config_file: [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown + /features/configuration.md) yaml file + * @return: success or failure of the validation + """ + cluster_id, log = setup(cluster_id) + LOG.info(f"Requested validation on {cluster_id}") + try: + content = await config_file.read() + configurations = yaml.safe_load(content.decode()) + providers = provider_handler.get_providers(configurations, log) + exit_state = check.check(configurations, providers, log) + if exit_state: + return JSONResponse( + content={"message": "Validation failed", "cluster_id": cluster_id, "success": exit_state}, + status_code=420) + return JSONResponse( + content={"message": "Validation successful", "cluster_id": cluster_id, "success": exit_state}, + status_code=200) + except Exception as exc: # pylint: disable=broad-except + return JSONResponse(content={"error": str(exc)}, status_code=400) + + +@app.post("/bibigrid/create", response_model=CreateResponseModel) +async def create_cluster(cluster_id: str = None, config_file: UploadFile = File(...)): + """ + Expects an optional cluster id and a + [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md) + file. + + Returns the cluster id and whether cluster creation (according to the configuration) has started. + The user then can check via [ready](#ready) if the cluster is ready. + * @param cluster_id: UUID with 15 letters. if not given, one is generated + * @param config_file: [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown + /features/configuration.md) yaml file + * @return: message whether the cluster creation has been started and cluster id + """ + LOG.debug(f"Requested creation on {cluster_id}") + cluster_id, log = setup(cluster_id) + + async def create_async(): + creator.create() + + try: + content = await config_file.read() + configurations = yaml.safe_load(content.decode()) + providers = provider_handler.get_providers(configurations, log) + creator = create.Create(providers=providers, configurations=configurations, log=log, + config_path=config_file.filename, cluster_id=cluster_id) + cluster_id = creator.cluster_id + asyncio.create_task(create_async()) + return JSONResponse(content={"message": "Cluster creation started.", "cluster_id": cluster_id}, status_code=202) + except Exception as exc: # pylint: disable=broad-except + return JSONResponse(content={"error": str(exc)}, status_code=400) + + +@app.post("/bibigrid/terminate", response_model=TerminateResponseModel) +async def terminate_cluster(cluster_id: str, config_file: UploadFile = File(...)): + """ + Expects a cluster id and a + [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md) + file. + + Returns whether cluster termination (according to the configuration) has started. + * @param cluster_id: id of cluster to terminate + * @param config_file: [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown + /features/configuration.md) yaml file + * @return: message whether the cluster termination has been started. + """ + cluster_id, log = setup(cluster_id) + LOG.debug(f"Requested termination on {cluster_id}") + + async def terminate_async(): + terminate.terminate(cluster_id, providers, log) + + try: + # Rewrite: Maybe load a configuration file stored somewhere locally to just define access + content = await config_file.read() + configurations = yaml.safe_load(content.decode()) + providers = provider_handler.get_providers(configurations, log) + asyncio.create_task(terminate_async()) + + return JSONResponse(content={"message": "Termination successfully requested."}, status_code=202) + except Exception as exc: # pylint: disable=broad-except + return JSONResponse(content={"error": str(exc)}, status_code=400) + + +@app.post("/bibigrid/info/", response_model=InfoResponseModel) +async def info(cluster_id: str, config_file: UploadFile): + """ + Expects a cluster id and a + [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown/features/configuration.md) + file. + + Returns detailed cluster information, including whether the cluster is "ready". + * @param cluster_id: id of cluster to get info on + * @param config_file: [configuration](https://github.com/BiBiServ/bibigrid/blob/master/documentation/markdown + /features/configuration.md) yaml file + * @return: detailed cluster information + """ + LOG.debug(f"Requested info on {cluster_id}.") + cluster_id, log = setup(cluster_id) + content = await config_file.read() + configurations = yaml.safe_load(content.decode()) + try: + providers = provider_handler.get_providers(configurations, log) + cluster_dict = list_clusters.dict_clusters(providers, log).get(cluster_id, {}) # add information filtering + if cluster_dict: + cluster_dict["message"] = "Cluster found." + cluster_dict["ready"] = is_up(cluster_id, log) + return JSONResponse(content=cluster_dict, status_code=200) + return JSONResponse(content={"message": "Cluster not found.", "ready": False}, status_code=404) + except Exception as exc: # pylint: disable=broad-except + return JSONResponse(content={"error": str(exc)}, status_code=400) + + +@app.get("/bibigrid/log/", response_model=LogResponseModel) +async def get_log(cluster_id: str, lines: int = None): + """ + Expects a cluster id and optional lines. + + Returns last lines of the .log for the given cluster id. If no lines are specified, all lines are returned. + * @param cluster_id: id of cluster to get .log from + * @param lines: lines to read from the end + * @return: Message whether the log has been found and if found, the las lines lines of the logged text + (or everything if lines were omitted). + """ + LOG.debug(f"Requested log on {cluster_id}.") + try: + file_name = os.path.join(LOG_FOLDER, f"{cluster_id}.log") + if os.path.isfile(file_name): + if not lines: + with open(file_name, "r", encoding="utf8") as log_file: + response = log_file.read() + else: + response = tail(file_name, lines) + return JSONResponse(content={"message": "Log found", "log": response}, status_code=200) + return JSONResponse(content={"message": "Log not found.", "log": None}, status_code=404) + except Exception as exc: # pylint: disable=broad-except + return JSONResponse(content={"error": str(exc)}, status_code=400) + + +@app.get("/bibigrid/ready/", response_model=ReadyResponseModel) +async def ready(cluster_id: str): + """ + Expects a cluster id. + + Returns whether the cluster with cluster id is ready according to the log file. + If the running state of the cluster has been changed outside BiBiGrid REST, this method cannot detect this. + + In such cases checking [info](#info)'s ready value is more reliable as it includes a check whether the cluster + actually exists on the provider. Ready omits checking the provider and is therefore less reliable, but faster. + * @param cluster_id: id of cluster to get ready state from + * @return: Message whether cluster is down or up and a bool "ready" whether the cluster is ready. + """ + LOG.debug(f"Requested log on {cluster_id}.") + try: + cluster_id, log = setup(cluster_id) + result = is_up(cluster_id, log) + return JSONResponse(content={"message": "Cluster is up" if result else "Cluster is down.", "ready": result}, + status_code=200) + except Exception as exc: # pylint: disable=broad-except + return JSONResponse(content={"error": str(exc)}, status_code=400) + + +# outdated tests +client = TestClient(app) + + +def test_validate(): + with open('test.yml', 'rb') as file: + response = client.post("/bibigrid/validate", files={"config_file": file}) + assert response.status_code == 200 + response_data = response.json() + assert response_data["message"] == "Validation successful" + assert "cluster_id" in response_data + + +def test_create(): + with open('test.yml', 'rb') as file: + response = client.post("/bibigrid/create", files={"config_file": file}) + assert response.status_code == 200 + response_data = response.json() + assert "cluster_id" in response_data + + +def test_terminate_cluster(): + with open('test.yml', 'rb') as file: + response = client.post("/bibigrid/terminate", params={"cluster_id": "2uiy5ka2c5y1k8o"}, + files={"config_file": file}) + assert response.status_code == 200 + response_data = response.json() + assert "message" in response_data + + +def test_info(): + # Assuming you've previously created a configuration with ID 1 + response = client.get("/bibigrid/info/1") + assert response.status_code == 200 + response_data = response.json() + assert bool(response_data) + + +def test_get_nonexistent_configuration_info(): + response = client.get("/bibigrid/info/999") + assert response.status_code == 404 + assert response.json() == {"error": "Configuration not found"} + + +# test_validate() +# test_create() # test_info() +# test_terminate_cluster() +# test_get_nonexistent_configuration_info() + + +if __name__ == "__main__": + uvicorn.run("bibigrid.core.startup_rest:app", host="0.0.0.0", port=8000, + workers=multiprocessing.cpu_count() * 2 + 1) # Use the on_starting event diff --git a/bibigrid/core/utility/ansible_configurator.py b/bibigrid/core/utility/ansible_configurator.py index a467a5f9..cc8f5040 100644 --- a/bibigrid/core/utility/ansible_configurator.py +++ b/bibigrid/core/utility/ansible_configurator.py @@ -2,7 +2,6 @@ Prepares ansible files (vars, common_configuration, ...) """ -import logging import os import mergedeep @@ -30,12 +29,12 @@ SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme", "munge_key": id_generation.generate_munge_key(), "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 900, "TreeWidth": 128}} -LOG = logging.getLogger("bibigrid") -def delete_old_vars(): +def delete_old_vars(log): """ Deletes host_vars and group_vars + @param log: @return: """ for folder in [aRP.GROUP_VARS_FOLDER, aRP.HOST_VARS_FOLDER]: @@ -43,7 +42,7 @@ def delete_old_vars(): # construct full file path file = os.path.join(folder, file_name) if os.path.isfile(file): - logging.debug('Deleting file: %s', file) + log.debug('Deleting file: %s', file) os.remove(file) @@ -67,7 +66,7 @@ def generate_site_file_yaml(custom_roles): return site_yaml -def write_host_and_group_vars(configurations, providers, cluster_id): # pylint: disable=too-many-locals +def write_host_and_group_vars(configurations, providers, cluster_id, log): # pylint: disable=too-many-locals """ ToDo filter what information really is necessary. Determined by further development Filters unnecessary information @@ -76,7 +75,7 @@ def write_host_and_group_vars(configurations, providers, cluster_id): # pylint: :param cluster_id: To get proper naming :return: filtered information (dict) """ - LOG.info("Generating instances file...") + log.info("Generating instances file...") flavor_keys = ["name", "ram", "vcpus", "disk", "ephemeral"] worker_count = 0 vpn_count = 0 @@ -84,14 +83,14 @@ def write_host_and_group_vars(configurations, providers, cluster_id): # pylint: configuration_features = configuration.get("features", []) if isinstance(configuration_features, str): configuration_features = [configuration_features] - for index, worker in enumerate(configuration.get("workerInstances", [])): + for worker in configuration.get("workerInstances", []): flavor = provider.get_flavor(worker["type"]) flavor_dict = {key: flavor[key] for key in flavor_keys} - name = create.WORKER_IDENTIFIER(worker_group=index, cluster_id=cluster_id, + name = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=f"[{worker_count}-{worker_count + worker.get('count', 1) - 1}]") group_name = name.replace("[", "").replace("]", "").replace(":", "_").replace("-", "_") worker_count += worker.get('count', 1) - regexp = create.WORKER_IDENTIFIER(worker_group=index, cluster_id=cluster_id, additional=r"\d+") + regexp = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=r"\d+") worker_dict = {"name": name, "regexp": regexp, "image": worker["image"], "network": configuration["network"], "flavor": flavor_dict, "gateway_ip": configuration["private_v4"], @@ -100,10 +99,10 @@ def write_host_and_group_vars(configurations, providers, cluster_id): # pylint: worker_features = worker.get("features", []) if isinstance(worker_features, str): worker_features = [worker_features] - features = set(configuration_features+worker_features) + features = set(configuration_features + worker_features) if features: worker_dict["features"] = features - write_yaml(os.path.join(aRP.GROUP_VARS_FOLDER, group_name), worker_dict) + write_yaml(os.path.join(aRP.GROUP_VARS_FOLDER, group_name), worker_dict, log) vpngtw = configuration.get("vpnInstance") if vpngtw: name = create.VPN_WORKER_IDENTIFIER(cluster_id=cluster_id, additional=f"{vpn_count}") @@ -113,32 +112,28 @@ def write_host_and_group_vars(configurations, providers, cluster_id): # pylint: flavor_dict = {key: flavor[key] for key in flavor_keys} regexp = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=r"\d+") vpngtw_dict = {"name": name, "regexp": regexp, "image": vpngtw["image"], - "network": configuration["network"], - "network_cidr": configuration["subnet_cidrs"], - "floating_ip": configuration["floating_ip"], - "private_v4": configuration["private_v4"], - "flavor": flavor_dict, - "wireguard_ip": wireguard_ip, - "cloud_identifier": configuration[ - "cloud_identifier"]} + "network": configuration["network"], "network_cidr": configuration["subnet_cidrs"], + "floating_ip": configuration["floating_ip"], "private_v4": configuration["private_v4"], + "flavor": flavor_dict, "wireguard_ip": wireguard_ip, + "cloud_identifier": configuration["cloud_identifier"], + "fallback_on_other_image": configuration.get("fallbackOnOtherImage", False)} if configuration.get("wireguard_peer"): - vpngtw_dict["wireguard"] = {"ip": wireguard_ip, - "peer": configuration.get( - "wireguard_peer")} - write_yaml(os.path.join(aRP.HOST_VARS_FOLDER, name), vpngtw_dict) + vpngtw_dict["wireguard"] = {"ip": wireguard_ip, "peer": configuration.get("wireguard_peer")} + write_yaml(os.path.join(aRP.HOST_VARS_FOLDER, name), vpngtw_dict, log) else: master = configuration["masterInstance"] name = create.MASTER_IDENTIFIER(cluster_id=cluster_id) flavor = provider.get_flavor(master["type"]) flavor_dict = {key: flavor[key] for key in flavor_keys} master_dict = {"name": name, "image": master["image"], "network": configuration["network"], - "network_cidr": configuration["subnet_cidrs"], - "floating_ip": configuration["floating_ip"], "flavor": flavor_dict, - "private_v4": configuration["private_v4"], - "cloud_identifier": configuration["cloud_identifier"]} + "network_cidr": configuration["subnet_cidrs"], "floating_ip": configuration["floating_ip"], + "flavor": flavor_dict, "private_v4": configuration["private_v4"], + "cloud_identifier": configuration["cloud_identifier"], + "volumes": configuration["volumes"], + "fallback_on_other_image": configuration.get("fallbackOnOtherImage", False)} if configuration.get("wireguard_peer"): master_dict["wireguard"] = {"ip": "10.0.0.1", "peer": configuration.get("wireguard_peer")} - write_yaml(os.path.join(aRP.GROUP_VARS_FOLDER, "master.yml"), master_dict) + write_yaml(os.path.join(aRP.GROUP_VARS_FOLDER, "master.yml"), master_dict, log) def pass_through(dict_from, dict_to, key_from, key_to=None): @@ -156,7 +151,7 @@ def pass_through(dict_from, dict_to, key_from, key_to=None): dict_to[key_to] = dict_from[key_from] -def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_user, default_user): +def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_user, default_user, log): """ Generates common_configuration yaml (dict) :param cidrs: str subnet cidrs (provider generated) @@ -164,12 +159,14 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us :param cluster_id: id of cluster :param ssh_user: user for ssh connections :param default_user: Given default user + :param log: :return: common_configuration_yaml (dict) """ master_configuration = configurations[0] - LOG.info("Generating common configuration file...") + log.info("Generating common configuration file...") # print(configuration.get("slurmConf", {})) - common_configuration_yaml = {"cluster_id": cluster_id, "cluster_cidrs": cidrs, "default_user": default_user, + common_configuration_yaml = {"auto_mount": master_configuration.get("autoMount", False), + "cluster_id": cluster_id, "cluster_cidrs": cidrs, "default_user": default_user, "local_fs": master_configuration.get("localFS", False), "local_dns_lookup": master_configuration.get("localDNSlookup", False), "use_master_as_compute": master_configuration.get("useMasterAsCompute", True), @@ -209,15 +206,16 @@ def generate_common_configuration_yaml(cidrs, configurations, cluster_id, ssh_us return common_configuration_yaml -def generate_ansible_hosts_yaml(ssh_user, configurations, cluster_id): +def generate_ansible_hosts_yaml(ssh_user, configurations, cluster_id, log): # pylint: disable-msg=too-many-locals """ Generates ansible_hosts_yaml (inventory file). :param ssh_user: str global SSH-username :param configurations: dict :param cluster_id: id of cluster + :param log: :return: ansible_hosts yaml (dict) """ - LOG.info("Generating ansible hosts file...") + log.info("Generating ansible hosts file...") ansible_hosts_yaml = {"vpn": {"hosts": {}, "children": {"master": {"hosts": {"localhost": to_instance_host_dict(ssh_user)}}, "vpngtw": {"hosts": {}}}}, "workers": {"hosts": {}, "children": {}}} @@ -227,8 +225,8 @@ def generate_ansible_hosts_yaml(ssh_user, configurations, cluster_id): worker_count = 0 vpngtw_count = 0 for configuration in configurations: - for index, worker in enumerate(configuration.get("workerInstances", [])): - name = create.WORKER_IDENTIFIER(worker_group=index, cluster_id=cluster_id, + for worker in configuration.get("workerInstances", []): + name = create.WORKER_IDENTIFIER(cluster_id=cluster_id, additional=f"[{worker_count}:{worker_count + worker.get('count', 1) - 1}]") worker_dict = to_instance_host_dict(ssh_user, ip="") group_name = name.replace("[", "").replace("]", "").replace(":", "_").replace("-", "_") @@ -274,7 +272,7 @@ def get_cidrs(configurations): return all_cidrs -def get_ansible_roles(ansible_roles): +def get_ansible_roles(ansible_roles, log): """ Checks if ansible_roles have all necessary values and returns True if so. :param ansible_roles: ansible_roles from master configuration (first configuration) @@ -289,14 +287,15 @@ def get_ansible_roles(ansible_roles): ansible_role_dict[key] = ansible_role[key] ansible_roles_yaml.append(ansible_role_dict) else: - LOG.warning("Ansible role %s had neither galaxy,git nor url. Not added.", ansible_role) + log.warning("Ansible role %s had neither galaxy,git nor url. Not added.", ansible_role) return ansible_roles_yaml -def get_ansible_galaxy_roles(ansible_galaxy_roles): +def get_ansible_galaxy_roles(ansible_galaxy_roles, log): """ Checks if ansible_galaxy_role have all necessary values and adds it to the return list if so. :param ansible_galaxy_roles: + :param log: :return: list of valid ansible_galaxy_roles """ ansible_galaxy_roles_yaml = [] @@ -308,17 +307,18 @@ def get_ansible_galaxy_roles(ansible_galaxy_roles): ansible_galaxy_role_dict[key] = ansible_galaxy_role[key] ansible_galaxy_roles_yaml.append(ansible_galaxy_role_dict) else: - LOG.warning("Galaxy role %s had neither galaxy,git nor url. Not added.", ansible_galaxy_role) + log.warning("Galaxy role %s had neither galaxy,git nor url. Not added.", ansible_galaxy_role) return ansible_galaxy_roles_yaml -def generate_worker_specification_file_yaml(configurations): +def generate_worker_specification_file_yaml(configurations, log): """ Generates worker_specification_file_yaml :param configurations: list of configurations (dict) + :param log: :return: worker_specification_yaml """ - LOG.info("Generating worker specification file...") + log.info("Generating worker specification file...") worker_groups_list = configuration_handler.get_list_by_key(configurations, "workerInstances", False) # create.prepare_configuration guarantees that key is set network_list = configuration_handler.get_list_by_key(configurations, "network", False) @@ -330,15 +330,16 @@ def generate_worker_specification_file_yaml(configurations): return worker_specification_yaml -def write_yaml(path, generated_yaml, alias=False): +def write_yaml(path, generated_yaml, log, alias=False): """ Writes generated_yaml to file path with or without alias @param path: @param generated_yaml: + @param log: @param alias: @return: """ - LOG.debug("Writing yaml %s", path) + log.debug("Writing yaml %s", path) with open(path, mode="w+", encoding="UTF-8") as file: if alias: yaml.safe_dump(data=generated_yaml, stream=file) @@ -360,27 +361,32 @@ def add_wireguard_peers(configurations): "subnet": configuration["subnet_cidrs"]} -def configure_ansible_yaml(providers, configurations, cluster_id): +def configure_ansible_yaml(providers, configurations, cluster_id, log): """ Generates and writes all ansible-configuration-yaml files. :param providers: list of providers :param configurations: list of configurations (dict) :param cluster_id: id of cluster to create + :param log: :return: """ - delete_old_vars() - LOG.info("Writing ansible files...") + delete_old_vars(log) + log.info("Writing ansible files...") alias = configurations[0].get("aliasDumper", False) - ansible_roles = get_ansible_roles(configurations[0].get("ansibleRoles")) + ansible_roles = get_ansible_roles(configurations[0].get("ansibleRoles"), log) default_user = providers[0].cloud_specification["auth"].get("username", configurations[0].get("sshUser", "Ubuntu")) add_wireguard_peers(configurations) for path, generated_yaml in [ - (aRP.WORKER_SPECIFICATION_FILE, generate_worker_specification_file_yaml(configurations)), ( + (aRP.WORKER_SPECIFICATION_FILE, generate_worker_specification_file_yaml(configurations, log)), ( aRP.COMMONS_CONFIG_FILE, generate_common_configuration_yaml(cidrs=get_cidrs(configurations), configurations=configurations, cluster_id=cluster_id, ssh_user=configurations[0]["sshUser"], - default_user=default_user)), - (aRP.HOSTS_CONFIG_FILE, generate_ansible_hosts_yaml(configurations[0]["sshUser"], configurations, cluster_id)), + default_user=default_user, log=log)), (aRP.HOSTS_CONFIG_FILE, + generate_ansible_hosts_yaml( + configurations[0][ + "sshUser"], + configurations, + cluster_id, log)), (aRP.SITE_CONFIG_FILE, generate_site_file_yaml(ansible_roles))]: - write_yaml(path, generated_yaml, alias) - write_host_and_group_vars(configurations, providers, cluster_id) # writing included in method + write_yaml(path, generated_yaml, log, alias) + write_host_and_group_vars(configurations, providers, cluster_id, log) # writing included in method diff --git a/bibigrid/core/utility/command_line_interpreter.py b/bibigrid/core/utility/command_line_interpreter.py index 31d07e0b..30fed2c9 100644 --- a/bibigrid/core/utility/command_line_interpreter.py +++ b/bibigrid/core/utility/command_line_interpreter.py @@ -6,7 +6,8 @@ import logging import os -STANDARD_CONFIG_INPUT_PATH = os.path.expanduser("~/.config/bibigrid") +INPUT_PATH = "~/.config/bibigrid" +STANDARD_CONFIG_INPUT_PATH = os.path.expanduser(INPUT_PATH) FOLDER_START = ("~/", "/") LOG = logging.getLogger("bibigrid") @@ -15,12 +16,12 @@ def check_cid(cid): if "-" in cid: new_cid = cid.split("-")[-1] LOG.info("-cid %s is not a cid, but probably the entire master name. Using '%s' as " - "cid instead.", cid, new_cid) + "cid instead.", cid, new_cid) return new_cid if "." in cid: LOG.info("-cid %s is not a cid, but probably the master's ip. " - "Using the master ip instead of cid only works if a cluster key is in your systems default ssh key " - "location (~/.ssh/). Otherwise bibigrid can't identify the cluster key.") + "Using the master ip instead of cid only works if a cluster key is in your systems default ssh key " + "location (~/.ssh/). Otherwise bibigrid can't identify the cluster key.") return cid @@ -29,31 +30,36 @@ def interpret_command_line(): Interprets commandline. Used in startup.py :return: """ - parser = argparse.ArgumentParser(description='Bibigrid sets up cluster easily inside a cloud environment') + parser = argparse.ArgumentParser(description='BiBiGrid easily sets up clusters within a cloud environment') parser.add_argument("-v", "--verbose", action="count", default=0, help="Increases logging verbosity. `-v` adds more info to the logfile, " - "`-vv` adds debug information to the logfile.") - parser.add_argument("-d", "--debug", action='store_true', help="Keeps cluster active. Asks before shutdown. " - "Offers termination after create") + "`-vv` adds debug information to the logfile") + parser.add_argument("-d", "--debug", action='store_true', help="Keeps cluster active even when crashing. " + "Asks before shutdown. " + "Offers termination after successful create") parser.add_argument("-i", "--config_input", metavar="", help="Path to YAML configurations file. " "Relative paths can be used and start " - "at ~/.config/bibigrid", required=True, + f"at '{INPUT_PATH}'. " + "Required for all actions but '--version'", type=lambda s: s if s.startswith(FOLDER_START) else os.path.join(STANDARD_CONFIG_INPUT_PATH, s)) parser.add_argument("-cid", "--cluster_id", metavar="", type=check_cid, default="", - help="Cluster id is needed for ide and termination") - + help="Cluster id is needed for '--ide', '--terminate_cluster' and '--update'. " + "If not set, last created cluster's id is used") actions = parser.add_mutually_exclusive_group(required=True) actions.add_argument("-V", "--version", action='store_true', help="Displays version") - actions.add_argument("-t", "--terminate_cluster", action='store_true', + actions.add_argument("-t", "--terminate", action='store_true', help="Terminates cluster. Needs cluster-id set.") actions.add_argument("-c", "--create", action='store_true', help="Creates cluster") - actions.add_argument("-l", "--list_clusters", action='store_true', + actions.add_argument("-l", "--list", action='store_true', help="Lists all running clusters. If cluster-id is set, will list this cluster in detail only") actions.add_argument("-ch", "--check", action='store_true', help="Validates cluster configuration") actions.add_argument("-ide", "--ide", action='store_true', - help="Establishes a secured connection to ide. Needs cluster-id set") + help="Establishes a secure connection to ide. Needs cluster-id set") actions.add_argument("-u", "--update", action='store_true', help="Updates master's playbook. " - "Needs cluster-id set, no job running " + "Needs cluster-id set, no jobs running " "and no workers up") args = parser.parse_args() + needs_config = args.terminate or args.create or args.list or args.check or args.ide + if needs_config and not args.config_input: + parser.error("requested action requires '-i' ('--config_input')") return args diff --git a/bibigrid/core/utility/handler/cluster_ssh_handler.py b/bibigrid/core/utility/handler/cluster_ssh_handler.py index 138011d2..7ffd7ea5 100644 --- a/bibigrid/core/utility/handler/cluster_ssh_handler.py +++ b/bibigrid/core/utility/handler/cluster_ssh_handler.py @@ -2,37 +2,37 @@ This module gets information about ssh connection. """ -import logging import os from bibigrid.core.actions import create, list_clusters -LOG = logging.getLogger("bibigrid") -def get_ssh_connection_info(cluster_id, master_provider, master_configuration): + +def get_ssh_connection_info(cluster_id, master_provider, master_configuration, log): """ Gets master_ip, ssh_user and private key to enable other modules to create an ssh connection to a clusters master @param cluster_id: id of cluster to connect to @param master_provider: master's provider @param master_configuration: master's configuration + @param log: @return: triple (master_ip, ssh_user, private_key) """ # If cluster_id is an ip, cluster_id will be used for master_ip if "." in cluster_id: - LOG.info("Interpreting %s as ip since it doesn't match cluster_id", cluster_id) + log.info("Interpreting %s as ip since it doesn't match cluster_id", cluster_id) master_ip = cluster_id else: - master_ip = list_clusters.get_master_access_ip(cluster_id, master_provider) + master_ip = list_clusters.get_master_access_ip(cluster_id, master_provider, log) ssh_user = master_configuration.get("sshUser") public_keys = master_configuration.get("sshPublicKeyFiles") used_private_key = None # first check configuration then if not found take the temporary key - if public_keys: - public_key = public_keys[0] + for public_key in public_keys: if isinstance(public_key, str): - private_key = public_key.strip(".pub") + private_key = public_key[:-4] if os.path.isfile(private_key): used_private_key = private_key + break if not used_private_key: private_key = os.path.join(create.KEY_FOLDER, create.KEY_NAME.format(cluster_id=cluster_id)) if os.path.isfile(private_key): diff --git a/bibigrid/core/utility/handler/configuration_handler.py b/bibigrid/core/utility/handler/configuration_handler.py index 4c8aa23c..c50a7f3d 100644 --- a/bibigrid/core/utility/handler/configuration_handler.py +++ b/bibigrid/core/utility/handler/configuration_handler.py @@ -2,7 +2,6 @@ This module contains methods to read the configuration and cloud specification. """ -import logging import os import mergedeep @@ -16,12 +15,11 @@ CLOUDS_PUBLIC_NAME_KEY = "profile" CLOUD_CONFIGURATION_KEY = "cloud" -LOG = logging.getLogger("bibigrid") - -def read_configuration(path="bibigrid.yml", configuration_list=True): +def read_configuration(log, path="bibigrid.yml", configuration_list=True): """ Reads yaml from file and returns configuration + :param log: :param path: Path to yaml file :param configuration_list: True if list is expected :return: configurations (dict) @@ -32,11 +30,11 @@ def read_configuration(path="bibigrid.yml", configuration_list=True): try: configuration = yaml.safe_load(stream) except yaml.YAMLError as exc: - LOG.warning("Couldn't read configuration %s: %s", path, exc) + log.warning("Couldn't read configuration %s: %s", path, exc) else: - LOG.warning("No such configuration file %s.", path) + log.warning("No such configuration file %s.", path) if configuration_list and not isinstance(configuration, list): - LOG.warning("Configuration should be list. Attempting to rescue by assuming a single configuration.") + log.warning("Configuration should be list. Attempting to rescue by assuming a single configuration.") return [configuration] return configuration @@ -57,51 +55,53 @@ def get_list_by_key(configurations, key, get_empty=True): # for configuration in configurations] -def find_file_in_folders(file_name, folders): +def find_file_in_folders(file_name, folders, log): """ Searches all folders for a file with name file_name, loads (expects yaml) the first match and returns the dict @param file_name: name of the file to look for @param folders: folders to search for file named file_name + @param log: @return: dict of match content or None if not found """ for folder_path in folders: file_path = os.path.expanduser(os.path.join(folder_path, file_name)) if os.path.isfile(file_path): - LOG.debug("File %s found in folder %s.", file_name, folder_path) - return read_configuration(file_path, False) - LOG.debug("File %s not found in folder %s.", file_name, folder_path) + log.debug("File %s found in folder %s.", file_name, folder_path) + return read_configuration(log, file_path, False) + log.debug("File %s not found in folder %s.", file_name, folder_path) return None -def get_clouds_files(): +def get_clouds_files(log): """ Wrapper to call find_file_in_folders with the right arguments to find the clouds.yaml and clouds-public.yaml @return: tuple of dicts containing the clouds.yaml and clouds-public.yaml data or None if not found. """ - clouds_yaml = find_file_in_folders(CLOUDS_YAML, CLOUDS_YAML_PATHS) - clouds_public_yaml = find_file_in_folders(CLOUDS_PUBLIC_YAML, CLOUDS_YAML_PATHS) + clouds_yaml = find_file_in_folders(CLOUDS_YAML, CLOUDS_YAML_PATHS, log) + clouds_public_yaml = find_file_in_folders(CLOUDS_PUBLIC_YAML, CLOUDS_YAML_PATHS, log) clouds = None clouds_public = None if clouds_yaml: clouds = clouds_yaml.get(CLOUD_ROOT_KEY) if not clouds: - LOG.warning("%s is not valid. Must contain key '%s:'", CLOUDS_YAML, CLOUD_ROOT_KEY) + log.warning("%s is not valid. Must contain key '%s:'", CLOUDS_YAML, CLOUD_ROOT_KEY) else: - LOG.warning("No %s at %s! Please copy your %s to one of those listed folders. Aborting...", CLOUDS_YAML, + log.warning("No %s at %s! Please copy your %s to one of those listed folders. Aborting...", CLOUDS_YAML, CLOUDS_YAML_PATHS, CLOUDS_YAML) if clouds_public_yaml: clouds_public = clouds_public_yaml.get(CLOUD_PUBLIC_ROOT_KEY) if not clouds_public: - LOG.warning("%s is not valid. Must contain key '%s'", CLOUDS_PUBLIC_YAML, CLOUD_PUBLIC_ROOT_KEY) + log.warning("%s is not valid. Must contain key '%s'", CLOUDS_PUBLIC_YAML, CLOUD_PUBLIC_ROOT_KEY) return clouds, clouds_public -def get_cloud_specification(cloud_name, clouds, clouds_public): +def get_cloud_specification(cloud_name, clouds, clouds_public, log): """ As in openstack cloud_public_specification will be overwritten by cloud_private_specification :param cloud_name: name of the cloud to look for in clouds.yaml :param clouds: dict containing the data loaded from clouds.yaml :param clouds_public: dict containing the data loaded from clouds-public.yaml + :param log: :return: """ cloud_full_specification = {} @@ -110,42 +110,43 @@ def get_cloud_specification(cloud_name, clouds, clouds_public): cloud_full_specification = cloud_private_specification public_cloud_name = cloud_private_specification.get(CLOUDS_PUBLIC_NAME_KEY) if public_cloud_name and clouds_public: - LOG.debug("Trying to find profile...") + log.debug("Trying to find profile...") cloud_public_specification = clouds_public.get(public_cloud_name) if not cloud_public_specification: - LOG.warning("%s is not a valid profile name. " + log.warning("%s is not a valid profile name. " "Must be contained under key '%s'", public_cloud_name, CLOUD_PUBLIC_ROOT_KEY) else: - LOG.debug("Profile found. Merging begins...") + log.debug("Profile found. Merging begins...") try: mergedeep.merge(cloud_full_specification, cloud_public_specification, strategy=mergedeep.Strategy.TYPESAFE_REPLACE) except TypeError as exc: - LOG.warning("Existing %s and %s configuration keys don't match in type: %s", CLOUDS_YAML, + log.warning("Existing %s and %s configuration keys don't match in type: %s", CLOUDS_YAML, CLOUDS_PUBLIC_YAML, exc) return {} else: - LOG.debug("Using only clouds.yaml since no clouds-public profile is set.") + log.debug("Using only clouds.yaml since no clouds-public profile is set.") if not cloud_full_specification.get("identifier"): cloud_full_specification["identifier"] = cloud_name else: - LOG.warning("%s is not a valid cloud name. Must be contained under key '%s'", cloud_name, CLOUD_ROOT_KEY) + log.warning("%s is not a valid cloud name. Must be contained under key '%s'", cloud_name, CLOUD_ROOT_KEY) return cloud_full_specification -def get_cloud_specifications(configurations): +def get_cloud_specifications(configurations, log): """ Calls get_cloud_specification to get the cloud_specification for every configuration @param configurations: + @param log: @return: list of dicts: cloud_specifications of every configuration """ - clouds, clouds_public = get_clouds_files() - LOG.debug("Loaded clouds.yml and clouds_public.yml") + clouds, clouds_public = get_clouds_files(log) + log.debug("Loaded clouds.yml and clouds_public.yml") cloud_specifications = [] if isinstance(clouds, dict): for configuration in configurations: cloud = configuration.get(CLOUD_CONFIGURATION_KEY) if cloud: - cloud_specifications.append(get_cloud_specification(cloud, clouds, clouds_public)) # might be None + cloud_specifications.append(get_cloud_specification(cloud, clouds, clouds_public, log)) # might be None return cloud_specifications diff --git a/bibigrid/core/utility/handler/logging_path_handler.py b/bibigrid/core/utility/handler/logging_path_handler.py index 42031452..7bd1ea74 100644 --- a/bibigrid/core/utility/handler/logging_path_handler.py +++ b/bibigrid/core/utility/handler/logging_path_handler.py @@ -2,16 +2,13 @@ This module holds methods to return the logfile's path. """ -import logging -LOG = logging.getLogger("bibigrid") - -def get_logging_path(): +def get_logging_path(log): """ Returns the path were the logfile is stored @return: the path were the logfile is stored """ - for handler in LOG.getLoggerClass().root.handlers: + for handler in log.getLoggerClass().root.handlers: if hasattr(handler, 'baseFilename'): log_path = handler.baseFilename return log_path diff --git a/bibigrid/core/utility/handler/provider_handler.py b/bibigrid/core/utility/handler/provider_handler.py index 18668b2f..3e74b1e9 100644 --- a/bibigrid/core/utility/handler/provider_handler.py +++ b/bibigrid/core/utility/handler/provider_handler.py @@ -2,26 +2,25 @@ This module contains different selectors to pick and create a connection to the right provider. """ -import logging - from bibigrid.core.utility.handler import configuration_handler from bibigrid.openstack import openstack_provider PROVIDER_NAME_DICT = {"openstack": openstack_provider.OpenstackProvider} PROVIDER_CLASS_DICT = {provider.__name__: provider for provider in PROVIDER_NAME_DICT.values()} -LOG = logging.getLogger("bibigrid") -def get_provider_by_class_name(provider_name, provider_dict=PROVIDER_CLASS_DICT): # pylint: disable=dangerous-default-value + +def get_provider_by_class_name(provider_name, + provider_dict=PROVIDER_CLASS_DICT): # pylint: disable=dangerous-default-value """ Returns provider that is associated with the key provider_name in provider_dict. - Otherwise a KeyError is thrown. + Otherwise, a KeyError is thrown. :param provider_name: key of provider_dict :return: provider """ return provider_dict[provider_name] -def get_provider_by_name(provider_name, provider_dict=PROVIDER_NAME_DICT): # pylint: disable=dangerous-default-value +def get_provider_by_name(provider_name, provider_dict=PROVIDER_NAME_DICT): # pylint: disable=dangerous-default-value """ Returns provider that is associated with the key provider_name in provider_dict. Otherwise a KeyError is thrown. @@ -41,24 +40,25 @@ def get_provider_list_by_name_list(provider_name_list, cloud_specifications): :return: list of providers """ provider_list = [ - (get_provider_by_name(provider_name) or get_provider_by_class_name(provider_name))(cloud_specification) - for provider_name, cloud_specification in zip(provider_name_list, cloud_specifications)] + (get_provider_by_name(provider_name) or get_provider_by_class_name(provider_name))(cloud_specification) for + provider_name, cloud_specification in zip(provider_name_list, cloud_specifications)] return provider_list -def get_providers(configurations): +def get_providers(configurations, log): """ Reads list of provider_names from configurations. Determines list of providers by provider_names and returns it. If providers don't match a key error is thrown and the program exits with failure state 1. :param configurations: + :param log: :return: """ - cloud_specifications = configuration_handler.get_cloud_specifications(configurations) + cloud_specifications = configuration_handler.get_cloud_specifications(configurations, log) if cloud_specifications: try: provider_names = configuration_handler.get_list_by_key(configurations, "infrastructure") return get_provider_list_by_name_list(provider_names, cloud_specifications) except KeyError as exc: - LOG.warning("Check infrastructure in configurations! Key: %s", str(exc)) + log.warning("Check infrastructure in configurations! Key: %s", str(exc)) return None diff --git a/bibigrid/core/utility/handler/ssh_handler.py b/bibigrid/core/utility/handler/ssh_handler.py index 222bcfd0..a705a405 100644 --- a/bibigrid/core/utility/handler/ssh_handler.py +++ b/bibigrid/core/utility/handler/ssh_handler.py @@ -2,29 +2,24 @@ This module handles ssh and sftp connections to master and vpngtw. It also holds general execution routines used to setup the Cluster. """ - -import logging import os import socket import time import paramiko import yaml +import sympy from bibigrid.core.utility import ansible_commands as aC from bibigrid.models.exceptions import ConnectionException, ExecutionException PRIVATE_KEY_FILE = ".ssh/id_ecdsa" # to name bibigrid-temp keys identically on remote -ANSIBLE_SETUP = [aC.NO_UPDATE, aC.UPDATE, - aC.PYTHON3_PIP, aC.ANSIBLE_PASSLIB, - (f"chmod 600 {PRIVATE_KEY_FILE}", "Adjust private key permissions."), - aC.PLAYBOOK_HOME, - aC.PLAYBOOK_HOME_RIGHTS, - aC.ADD_PLAYBOOK_TO_LINUX_HOME] +ANSIBLE_SETUP = [aC.NO_UPDATE, aC.UPDATE, aC.PYTHON3_PIP, aC.ANSIBLE_PASSLIB, + (f"chmod 600 {PRIVATE_KEY_FILE}", "Adjust private key permissions."), aC.PLAYBOOK_HOME, + aC.PLAYBOOK_HOME_RIGHTS, aC.ADD_PLAYBOOK_TO_LINUX_HOME] # ANSIBLE_START = [aC.WAIT_READY, aC.UPDATE, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE] # another UPDATE seems to not necessary. ANSIBLE_START = [aC.WAIT_READY, aC.MV_ANSIBLE_CONFIG, aC.EXECUTE] VPN_SETUP = [("echo Example", "Echos an Example")] -LOG = logging.getLogger("bibigrid") def get_ac_command(providers, name): @@ -41,16 +36,14 @@ def get_ac_command(providers, name): if auth.get("application_credential_id") and auth.get("application_credential_secret"): wanted_keys = ["auth", "region_name", "interface", "identity_api_version", "auth_type"] ac_cloud_specification = {wanted_key: cloud_specification[wanted_key] for wanted_key in wanted_keys if - wanted_key in - cloud_specification} + wanted_key in cloud_specification} else: wanted_keys = ["region_name", "interface", "identity_api_version"] ac = provider.create_application_credential(name=name) # pylint: disable=invalid-name ac_dict = {"application_credential_id": ac["id"], "application_credential_secret": ac["secret"], "auth_type": "v3applicationcredential", "auth_url": auth["auth_url"]} ac_cloud_specification = {wanted_key: cloud_specification[wanted_key] for wanted_key in wanted_keys if - wanted_key in - cloud_specification} + wanted_key in cloud_specification} ac_cloud_specification.update(ac_dict) ac_clouds_yaml["clouds"][cloud_specification["identifier"]] = ac_cloud_specification return (f"echo '{yaml.safe_dump(ac_clouds_yaml)}' | sudo install -D /dev/stdin /etc/openstack/clouds.yaml", @@ -72,28 +65,29 @@ def get_add_ssh_public_key_commands(ssh_public_key_files): return commands -def copy_to_server(sftp, localpath, remotepath): +def copy_to_server(sftp, local_path, remote_path, log): """ Recursively copies files and folders to server. - If a folder is given as localpath, the structure within will be kept. + If a folder is given as local_path, the structure within will be kept. :param sftp: sftp connection - :param localpath: file or folder locally - :param remotepath: file or folder locally + :param local_path: file or folder locally + :param remote_path: file or folder locally + :param log: :return: """ - LOG.debug("Copy %s to %s...", localpath, remotepath) - if os.path.isfile(localpath): - sftp.put(localpath, remotepath) + log.debug("Copy %s to %s...", local_path, remote_path) + if os.path.isfile(local_path): + sftp.put(local_path, remote_path) else: try: - sftp.mkdir(remotepath) + sftp.mkdir(remote_path) except OSError: pass - for filename in os.listdir(localpath): - copy_to_server(sftp, localpath + "/" + filename, remotepath + "/" + filename) + for filename in os.listdir(local_path): + copy_to_server(sftp, os.path.join(local_path, filename), os.path.join(remote_path, filename), log) -def is_active(client, floating_ip_address, private_key, username, timeout=5): +def is_active(client, floating_ip_address, private_key, username, log, gateway, timeout=5): """ Checks if connection is possible and therefore if server is active. Raises paramiko.ssh_exception.NoValidConnectionsError if timeout is reached @@ -101,33 +95,41 @@ def is_active(client, floating_ip_address, private_key, username, timeout=5): :param floating_ip_address: ip to connect to :param private_key: SSH-private_key :param username: SSH-username + :param log: :param timeout: how long to wait between ping + :param gateway: if node should be reached over a gateway port is set to 30000 + subnet * 256 + host (waiting grows quadratically till 2**timeout before accepting failure) """ attempts = 0 establishing_connection = True while establishing_connection: try: - client.connect(hostname=floating_ip_address, username=username, pkey=private_key, timeout=7, auth_timeout=5) + port = 22 + if gateway: + log.info(f"Using SSH Gateway {gateway.get('ip')}") + octets = {f'oct{enum+1}': int(elem) for enum, elem in enumerate(floating_ip_address.split("."))} + port = int(sympy.sympify(gateway["portFunction"]).subs(dict(octets))) + client.connect(hostname=gateway.get("ip") or floating_ip_address, username=username, + pkey=private_key, timeout=7, auth_timeout=5, port=port) establishing_connection = False - LOG.info(f"Successfully connected to {floating_ip_address}") + log.info(f"Successfully connected to {floating_ip_address}") except paramiko.ssh_exception.NoValidConnectionsError as exc: - LOG.info(f"Attempting to connect to {floating_ip_address}... This might take a while", ) + log.info(f"Attempting to connect to {floating_ip_address}... This might take a while", ) if attempts < timeout: time.sleep(2 ** attempts) attempts += 1 else: - LOG.error(f"Attempt to connect to {floating_ip_address} failed.") + log.error(f"Attempt to connect to {floating_ip_address} failed.") raise ConnectionException(exc) from exc except socket.timeout as exc: - LOG.warning("Socket timeout exception occurred. Try again ...") + log.warning("Socket timeout exception occurred. Try again ...") if attempts < timeout: attempts += 1 else: - LOG.error(f"Attempt to connect to {floating_ip_address} failed, due to a socket timeout.") + log.error(f"Attempt to connect to {floating_ip_address} failed, due to a socket timeout.") raise ConnectionException(exc) from exc except TimeoutError as exc: # pylint: disable=duplicate-except - LOG.error("The attempt to connect to %s failed. Possible known reasons:" + log.error("The attempt to connect to %s failed. Possible known reasons:" "\n\t-Your network's security group doesn't allow SSH.", floating_ip_address) raise ConnectionException(exc) from exc @@ -148,25 +150,26 @@ def line_buffered(f): line_buf = b'' -def execute_ssh_cml_commands(client, commands): +def execute_ssh_cml_commands(client, commands, log): """ Executes commands and logs exit_status accordingly. :param client: Client with connection to remote :param commands: Commands to execute on remote + :param log: """ for command in commands: ssh_stdin, ssh_stdout, ssh_stderr = client.exec_command(command[0]) # pylint: disable=unused-variable ssh_stdout.channel.set_combine_stderr(True) - LOG.info(f"REMOTE: {command[1]}") + log.info(f"REMOTE: {command[1]}") while True: line = ssh_stdout.readline() if len(line) == 0: break if "[BIBIGRID]" in line: - LOG.info(f"REMOTE: {line.strip()}") + log.info(f"REMOTE: {line.strip()}") else: - LOG.debug(f"REMOTE: {line.strip()}") + log.debug(f"REMOTE: {line.strip()}") # get exit status exit_status = ssh_stdout.channel.recv_exit_status() @@ -175,11 +178,11 @@ def execute_ssh_cml_commands(client, commands): if exit_status: msg = f"{command[1]} ... Exit status: {exit_status}" - LOG.warning(msg) + log.warning(msg) raise ExecutionException(msg) -def ansible_preparation(floating_ip, private_key, username, commands=None, filepaths=None): +def ansible_preparation(floating_ip, private_key, username, log, gateway, commands=None, filepaths=None): """ Installs python and pip. Then installs ansible over pip. Copies private key to instance so cluster-nodes are reachable and sets permission as necessary. @@ -189,27 +192,31 @@ def ansible_preparation(floating_ip, private_key, username, commands=None, filep :param floating_ip: public ip of server to ansible-prepare :param private_key: generated private key of all cluster-server :param username: username of all server + :param log: :param commands: additional commands to execute :param filepaths: additional files to copy: (localpath, remotepath) + :param gateway """ if filepaths is None: filepaths = [] if commands is None: commands = [] - LOG.info("Ansible preparation...") + log.info("Ansible preparation...") commands = ANSIBLE_SETUP + commands filepaths.append((private_key, PRIVATE_KEY_FILE)) - execute_ssh(floating_ip, private_key, username, commands, filepaths) + execute_ssh(floating_ip, private_key, username, log, gateway, commands, filepaths) -def execute_ssh(floating_ip, private_key, username, commands=None, filepaths=None): +def execute_ssh(floating_ip, private_key, username, log, gateway, commands=None, filepaths=None): """ Executes commands on remote and copies files given in filepaths :param floating_ip: public ip of remote :param private_key: key of remote :param username: username of remote :param commands: commands + :param log: :param filepaths: filepaths (localpath, remotepath) + :param gateway: gateway if used """ if commands is None: commands = [] @@ -217,21 +224,19 @@ def execute_ssh(floating_ip, private_key, username, commands=None, filepaths=Non with paramiko.SSHClient() as client: client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) try: - is_active(client=client, - floating_ip_address=floating_ip, - username=username, - private_key=paramiko_key) + is_active(client=client, floating_ip_address=floating_ip, username=username, private_key=paramiko_key, + log=log, gateway=gateway) except ConnectionException as exc: - LOG.error(f"Couldn't connect to floating ip {floating_ip} using private key {private_key}.") + log.error(f"Couldn't connect to ip {gateway or floating_ip} using private key {private_key}.") raise exc else: - LOG.debug(f"Setting up {floating_ip}") + log.debug(f"Setting up {floating_ip}") if filepaths: - LOG.debug(f"Setting up filepaths for {floating_ip}") + log.debug(f"Setting up filepaths for {floating_ip}") sftp = client.open_sftp() - for localpath, remotepath in filepaths: - copy_to_server(sftp=sftp, localpath=localpath, remotepath=remotepath) - LOG.debug("SFTP: Files %s copied.", filepaths) + for local_path, remote_path in filepaths: + copy_to_server(sftp=sftp, local_path=local_path, remote_path=remote_path, log=log) + log.debug("SFTP: Files %s copied.", filepaths) if commands: - LOG.debug(f"Setting up commands for {floating_ip}") - execute_ssh_cml_commands(client, commands) + log.debug(f"Setting up commands for {floating_ip}") + execute_ssh_cml_commands(client=client, commands=commands, log=log) diff --git a/bibigrid/core/utility/image_selection.py b/bibigrid/core/utility/image_selection.py new file mode 100644 index 00000000..8bc15387 --- /dev/null +++ b/bibigrid/core/utility/image_selection.py @@ -0,0 +1,30 @@ +""" +Methods for image selection +""" +import difflib +import re + +from bibigrid.models.exceptions import ImageNotActiveException + + +def select_image(provider, image, log, fallback=None): + # check if image is active + active_images = provider.get_active_images() + if image not in active_images: + old_image = image + log.info(f"Image '{old_image}' has no direct match. Maybe it's a regex? Trying regex match.") + image = next((elem for elem in active_images if re.match(image, elem)), None) + if not image: + log.warning(f"Couldn't find image '{old_image}'.") + if isinstance(fallback, str): + image = next(elem for elem in active_images if re.match(fallback, elem)) + log.info(f"Taking first regex ('{fallback}') match '{image}'.") + elif fallback: + image = difflib.get_close_matches(old_image, active_images)[0] + log.info(f"Taking closest active image (in name) '{image}' instead.") + else: + raise ImageNotActiveException(f"Image {old_image} no longer active or doesn't exist.") + log.info(f"Using alternative '{image}' instead of '{old_image}'. You should change the configuration.") + else: + log.info(f"Taking first regex match: '{image}'") + return image diff --git a/bibigrid/core/utility/validate_configuration.py b/bibigrid/core/utility/validate_configuration.py index 56037842..72deb806 100644 --- a/bibigrid/core/utility/validate_configuration.py +++ b/bibigrid/core/utility/validate_configuration.py @@ -2,38 +2,40 @@ Validates configuration and cloud_specification """ -import logging import os +from bibigrid.core.utility import image_selection from bibigrid.core.utility.handler import configuration_handler +from bibigrid.models.exceptions import ImageNotActiveException ACCEPTED_KEY_IDENTIFIERS = {"RSA": 4096, "ECDSA": 521, "ED25519": 256} -LOG = logging.getLogger("bibigrid") -def evaluate(check_name, check_result): +def evaluate(check_name, check_result, log): """ Logs check_result as warning if failed and as success if succeeded. :param check_name: :param check_result: + :param log: :return: """ if check_result: - LOG.info("Checking %s: Success", check_name) + log.info("Checking %s: Success", check_name) else: - LOG.warning("Checking %s: Failure", check_name) + log.warning("Checking %s: Failure", check_name) return check_result -def check_provider_data(provider_data_list, provider_count): +def check_provider_data(provider_data_list, provider_count, log): """ Checks if all provider datas are unique and if enough providers are given #ToDo for multiple cloud locations additional provider data needs to be added :param provider_data_list: list of all provider data :param provider_count: number of providers + :param log: :return: True if enough providers are given and all providers are unique """ - LOG.info("Checking provider names") + log.info("Checking provider names") success = True duplicates = [] seen = [] @@ -43,23 +45,24 @@ def check_provider_data(provider_data_list, provider_count): else: seen.append(elem) if duplicates: - LOG.warning("Duplicate provider(s) %s. For each provider you can only create one configuration. " + log.warning("Duplicate provider(s) %s. For each provider you can only create one configuration. " "Please check your configurations.", duplicates) success = False else: - LOG.info("All providers are unique.") + log.info("All providers are unique.") if not len(provider_data_list) == provider_count: - LOG.warning("Not enough providers given. %s/%s", len(provider_data_list), provider_count) + log.warning("Not enough providers given. %s/%s", len(provider_data_list), provider_count) success = False else: - LOG.info("Enough providers given. %s/%s", len(provider_data_list), provider_count) + log.info("Enough providers given. %s/%s", len(provider_data_list), provider_count) return success -def evaluate_ssh_public_key_file_security(ssh_public_key_file): +def evaluate_ssh_public_key_file_security(ssh_public_key_file, log): """ Checks if key encryption is sufficiently strong. Uses empiric values and therefore will fail if key type is unknown @param ssh_public_key_file: + @param log: @return: """ success = True @@ -70,64 +73,67 @@ def evaluate_ssh_public_key_file_security(ssh_public_key_file): minimum_size = ACCEPTED_KEY_IDENTIFIERS.get(identifier_clean) if not minimum_size: - LOG.warning("sshPublicKey '%s' is %s. Which secure length is unknown to bibigrid.\n" - "Known encryptions are (with minimum size): %s", - ssh_public_key_file, identifier_clean, ACCEPTED_KEY_IDENTIFIERS) + log.warning("sshPublicKey '%s' is %s. Which secure length is unknown to bibigrid.\n" + "Known encryptions are (with minimum size): %s", ssh_public_key_file, identifier_clean, + ACCEPTED_KEY_IDENTIFIERS) else: - LOG.info("sshPublicKey '%s' is a known encryption.", ssh_public_key_file) + log.info("sshPublicKey '%s' is a known encryption.", ssh_public_key_file) if minimum_size > int(length): - LOG.warning("sshPublicKey '%s' is not long enough! %s should be >= %s, but is %s", - ssh_public_key_file, identifier_clean, minimum_size, int(length)) + log.warning("sshPublicKey '%s' is not long enough! %s should be >= %s, but is %s", ssh_public_key_file, + identifier_clean, minimum_size, int(length)) else: - LOG.info("sshPublicKey '%s' is long enough (%s/%s)!", ssh_public_key_file, int(length), minimum_size) + log.info("sshPublicKey '%s' is long enough (%s/%s)!", ssh_public_key_file, int(length), minimum_size) return success -def has_enough(maximum, needed, keeper, thing): +def has_enough(maximum, needed, keeper, thing, log): """ Method logs and compares whether enough free things are available :param maximum: maximum (available) resources of thing :param needed: minimum needed to run :param keeper: description of the object having the thing that is checked (for logging) :param thing: description of what resource is checked (RAM for example) (for logging) + :param log: :return: True if maximum is larger or equal to the needed """ success = True if maximum >= needed: - LOG.info("%s has enough %s: %s/%s", keeper, thing, needed, maximum) + log.info("%s has enough %s: %s/%s", keeper, thing, needed, maximum) elif maximum < 0: - LOG.warning("%s returns no valid value for %s: %s/%s -- Ignored.", keeper, thing, needed, maximum) + log.warning("%s returns no valid value for %s: %s/%s -- Ignored.", keeper, thing, needed, maximum) else: - LOG.warning("%s has not enough %s: %s/%s", keeper, thing, needed, maximum) + log.warning("%s has not enough %s: %s/%s", keeper, thing, needed, maximum) success = False return success -def check_clouds_yaml_security(): +def check_clouds_yaml_security(log): """ Checks security of all clouds in clouds.yaml i.e. whether sensitive information is stored in clouds-public.yaml + @param log: @return: True if no sensitive information is stored in clouds-public.yaml. False else. """ success = True - LOG.info("Checking validity of entire clouds.yaml and clouds-public.yaml") - clouds, clouds_public = configuration_handler.get_clouds_files() # pylint: disable=unused-variable + log.info("Checking validity of entire clouds.yaml and clouds-public.yaml") + clouds, clouds_public = configuration_handler.get_clouds_files(log) # pylint: disable=unused-variable if clouds_public: for cloud in clouds_public: if clouds_public[cloud].get("profile"): - LOG.warning(f"{cloud}: Profiles should be placed in clouds.yaml not clouds-public.yaml!") + log.warning(f"{cloud}: Profiles should be placed in clouds.yaml not clouds-public.yaml!") success = False if clouds_public[cloud].get("auth"): for key in ["password", "username", "application_credential_id", "application_credential_secret"]: if clouds_public[cloud]["auth"].get(key): - LOG.warning(f"{cloud}: {key} shouldn't be shared. Move {key} to clouds.yaml!") + log.warning(f"{cloud}: {key} shouldn't be shared. Move {key} to clouds.yaml!") success = False return success -def check_cloud_yaml(cloud_specification): +def check_cloud_yaml(cloud_specification, log): """ Check if cloud_specification is valid i.e. contains the necessary authentification data. @param cloud_specification: dict to check whether it is a valid cloud_specification + @param log @return: True if cloud_specification is valid. False else. """ success = True @@ -136,24 +142,24 @@ def check_cloud_yaml(cloud_specification): auth = cloud_specification.get("auth") if auth: auth_keys = auth.keys() - if not ("password" in auth_keys and "username" in auth_keys) \ - and not ("auth_type" in keys and "application_credential_id" in auth_keys and - "application_credential_secret" in auth_keys): - LOG.warning("Insufficient authentication information. Needs either password and username or " + if not ("password" in auth_keys and "username" in auth_keys) and not ( + "auth_type" in keys and "application_credential_id" in auth_keys and + "application_credential_secret" in auth_keys): + log.warning("Insufficient authentication information. Needs either password and username or " "if using application credentials: " "auth_type, application_credential_id and application_credential_secret.") success = False if "auth_url" not in auth_keys: - LOG.warning("Authentification URL auth_url is missing.") + log.warning("Authentification URL auth_url is missing.") success = False else: - LOG.warning("Missing all auth information!") + log.warning("Missing all auth information!") success = False if "region_name" not in keys: - LOG.warning("region_name is missing.") + log.warning("region_name is missing.") success = False else: - LOG.warning("Missing all cloud_specification information!") + log.warning("Missing all cloud_specification information!") return success @@ -162,7 +168,7 @@ class ValidateConfiguration: This class contains necessary algorithms to validate configuration files """ - def __init__(self, configurations, providers): + def __init__(self, configurations, providers, log): """ Sets configurations, providers and prepares the required_resources_dict. While executing the checks, needed resources are counted. @@ -170,6 +176,7 @@ def __init__(self, configurations, providers): :param configurations: List of configurations (dicts) :param providers: List of providers """ + self.log = log self.configurations = configurations self.providers = providers self.required_resources_dict = {'total_cores': 0, 'floating_ips': 0, 'instances': 0, 'total_ram': 0, @@ -190,7 +197,7 @@ def validate(self): :return: """ success = bool(self.providers) - LOG.info("Validating config file...") + self.log.info("Validating config file...") # success = check_provider_data( # configuration_handler.get_list_by_key(self.configurations, "cloud"), # len(self.configurations)) and success @@ -198,13 +205,12 @@ def validate(self): # LOG.warning("Providers not set correctly in configuration file. Check log for more detail.") # return success checks = [("master/vpn", self.check_master_vpn_worker), ("servergroup", self.check_server_group), - ("instances", self.check_instances), ("volumes", self.check_volumes), - ("network", self.check_network), ("quotas", self.check_quotas), - ("sshPublicKeyFiles", self.check_ssh_public_key_files), ("cloudYamls", self.check_clouds_yamls), - ("nfs", self.check_nfs)] + ("instances", self.check_instances), ("volumes", self.check_volumes), ("network", self.check_network), + ("quotas", self.check_quotas), ("sshPublicKeyFiles", self.check_ssh_public_key_files), + ("cloudYamls", self.check_clouds_yamls), ("nfs", self.check_nfs)] if success: for check_name, check_function in checks: - success = evaluate(check_name, check_function()) and success + success = evaluate(check_name, check_function(), self.log) and success return success def check_master_vpn_worker(self): @@ -214,7 +220,7 @@ def check_master_vpn_worker(self): If one is missing said provider wouldn't be reachable over the cluster, because no floating IP would be given. :return: True if first configuration has a masterInstance and every other a vpnInstance """ - LOG.info("Checking master/vpn") + self.log.info("Checking master/vpn") success = True if not self.configurations[0].get("masterInstance") or self.configurations[0].get("vpnInstance"): success = False @@ -234,8 +240,8 @@ def check_provider_connections(self): if not provider.conn: providers_unconnectable.append(provider.cloud_specification["identifier"]) if providers_unconnectable: - LOG.warning("API connection to %s not successful. Please check your configuration.", - providers_unconnectable) + self.log.warning("API connection to %s not successful. Please check your configuration.", + providers_unconnectable) success = False return success @@ -244,21 +250,21 @@ def check_instances(self): Checks if all instances exist and image and instance-type are compatible :return: true if image and instance-type (flavor) exist for all instances and are compatible """ - LOG.info("Checking instance images and type") + self.log.info("Checking instance images and type") success = True configuration = None try: for configuration, provider in zip(self.configurations, self.providers): self.required_resources_dict["floating_ips"] += 1 if configuration.get("masterInstance"): - success = self.check_instance("masterInstance", configuration["masterInstance"], provider) \ - and success + success = self.check_instance("masterInstance", configuration["masterInstance"], + provider) and success else: success = self.check_instance("vpnInstance", configuration["vpnInstance"], provider) and success for worker in configuration.get("workerInstances", []): success = self.check_instance("workerInstance", worker, provider) and success except KeyError as exc: - LOG.warning("Not found %s, but required in configuration %s.", str(exc), configuration) + self.log.warning("Not found %s, but required in configuration %s.", str(exc), configuration) success = False return success @@ -272,19 +278,16 @@ def check_instance(self, instance_name, instance, provider): """ self.required_resources_dict["instances"] += instance.get("count") or 1 instance_image_id_or_name = instance["image"] - instance_image = provider.get_image_by_id_or_name(image_id_or_name=instance_image_id_or_name) - if not instance_image: - LOG.warning("Instance %s image: %s not found", instance_name, instance_image_id_or_name) - print("Available active images:") - print("\n".join(provider.get_active_images())) - return False - if instance_image["status"] != "active": - LOG.warning("Instance %s image: %s not active", instance_name, instance_image_id_or_name) - print("Available active images:") - print("\n".join(provider.get_active_images())) + try: + instance_image = image_selection.select_image(provider, instance_image_id_or_name, self.log) + self.log.info("Instance %s image: %s found", instance_name, instance_image_id_or_name) + instance_type = instance["type"] + except ImageNotActiveException: + self.log.warning("Instance %s image: %s not found among active images.", + instance_name, instance_image_id_or_name) + self.log.log(42, "Available active images:") + self.log.log(42, "\n".join(provider.get_active_images())) return False - LOG.info("Instance %s image: %s found", instance_name, instance_image_id_or_name) - instance_type = instance["type"] return self.check_instance_type_image_combination(instance_type, instance_image, provider) def check_instance_type_image_combination(self, instance_type, instance_image, provider): @@ -299,7 +302,7 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p # check flavor = provider.get_flavor(instance_type) if not flavor: - LOG.warning("Flavor %s does not exist.", instance_type) + self.log.warning("Flavor %s does not exist.", instance_type) print("Available flavors:") print("\n".join(provider.get_active_flavors())) return False @@ -309,7 +312,7 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p image_min_ram = provider.get_image_by_id_or_name(instance_image)["min_ram"] for maximum, needed, thing in [(type_max_disk_space, image_min_disk_space, "disk space"), (type_max_ram, image_min_ram, "ram")]: - success = has_enough(maximum, needed, f"Type {instance_type}", thing) and success + success = has_enough(maximum, needed, f"Type {instance_type}", thing, self.log) and success # prepare check quotas self.required_resources_dict["total_ram"] += type_max_ram self.required_resources_dict["total_cores"] += flavor["vcpus"] @@ -320,7 +323,7 @@ def check_volumes(self): Checking if volume or snapshot exists for all volumes :return: True if all snapshot and volumes are found. Else false. """ - LOG.info("Checking volumes...") + self.log.info("Checking volumes...") success = True for configuration, provider in zip(self.configurations, self.providers): volume_identifiers = configuration.get("masterMounts") @@ -335,14 +338,14 @@ def check_volumes(self): if not volume: snapshot = provider.get_volume_snapshot_by_id_or_name(volume_name_or_id) if not snapshot: - LOG.warning("Neither Volume nor Snapshot '%s' found", volume_name_or_id) + self.log.warning("Neither Volume nor Snapshot '%s' found", volume_name_or_id) success = False else: - LOG.info("Snapshot '%s' found", volume_name_or_id) + self.log.info("Snapshot '%s' found", volume_name_or_id) self.required_resources_dict["Volumes"] += 1 self.required_resources_dict["VolumeGigabytes"] += snapshot["size"] else: - LOG.info(f"Volume '{volume_name_or_id}' found") + self.log.info(f"Volume '{volume_name_or_id}' found") return success def check_network(self): @@ -350,25 +353,25 @@ def check_network(self): Check if network (or subnet) is accessible :return True if any given network or subnet is accessible by provider """ - LOG.info("Checking network...") + self.log.info("Checking network...") success = True for configuration, provider in zip(self.configurations, self.providers): network_name_or_id = configuration.get("network") if network_name_or_id: network = provider.get_network_by_id_or_name(network_name_or_id) if not network: - LOG.warning(f"Network '{network_name_or_id}' not found", network_name_or_id) + self.log.warning(f"Network '{network_name_or_id}' not found", network_name_or_id) success = False else: - LOG.info(f"Network '{subnet_name_or_id}' found") + self.log.info(f"Network '{network_name_or_id}' found") subnet_name_or_id = configuration.get("subnet") if subnet_name_or_id: subnet = provider.get_subnet_by_id_or_name(subnet_name_or_id) if not subnet: - LOG.warning(f"Subnet '{subnet_name_or_id}' not found") + self.log.warning(f"Subnet '{subnet_name_or_id}' not found") success = False else: - LOG.info(f"Subnet '{subnet_name_or_id}' found") + self.log.info(f"Subnet '{subnet_name_or_id}' found") return bool(success and (network_name_or_id or subnet_name_or_id)) def check_server_group(self): @@ -381,10 +384,10 @@ def check_server_group(self): if server_group_name_or_id: server_group = provider.get_server_group_by_id_or_name(server_group_name_or_id) if not server_group: - LOG.warning("ServerGroup '%s' not found", server_group_name_or_id) + self.log.warning("ServerGroup '%s' not found", server_group_name_or_id) success = False else: - LOG.info("ServerGroup '%s' found", server_group_name_or_id) + self.log.info("ServerGroup '%s' found", server_group_name_or_id) return success def check_quotas(self): @@ -396,16 +399,15 @@ def check_quotas(self): is returned to make the check not fail because of the missing API implementation. :return: True if check succeeded. Else false. """ - LOG.info("Checking quotas") + self.log.info("Checking quotas") success = True - LOG.info("required/available") + self.log.info("required/available") for provider in self.providers: free_resources_dict = provider.get_free_resources() for key, value in self.required_resources_dict.items(): - success = has_enough(free_resources_dict[key], - value, - f"Project {self.providers[0].cloud_specification['identifier']}", - key) and success + success = has_enough(free_resources_dict[key], value, + f"Project {self.providers[0].cloud_specification['identifier']}", key, + self.log) and success return success def check_ssh_public_key_files(self): @@ -417,11 +419,11 @@ def check_ssh_public_key_files(self): for configuration in self.configurations: for ssh_public_key_file in configuration.get("sshPublicKeyFiles") or []: if not os.path.isfile(ssh_public_key_file): - LOG.warning("sshPublicKeyFile '%s' not found", ssh_public_key_file) + self.log.warning("sshPublicKeyFile '%s' not found", ssh_public_key_file) success = False else: - LOG.info("sshPublicKeyFile '%s' found", ssh_public_key_file) - success = evaluate_ssh_public_key_file_security(ssh_public_key_file) and success + self.log.info("sshPublicKeyFile '%s' found", ssh_public_key_file) + success = evaluate_ssh_public_key_file_security(ssh_public_key_file, self.log) and success return success def check_clouds_yamls(self): @@ -429,14 +431,15 @@ def check_clouds_yamls(self): Checks if every cloud in clouds_yaml is valid @return: True if all clouds are valid """ - LOG.info("Checking cloud specifications...") + self.log.info("Checking cloud specifications...") success = True - cloud_specifications = configuration_handler.get_cloud_specifications(self.configurations) + cloud_specifications = configuration_handler.get_cloud_specifications(self.configurations, self.log) for index, cloud_specification in enumerate(cloud_specifications): - if not check_cloud_yaml(cloud_specification): + if not check_cloud_yaml(cloud_specification, self.log): success = False - LOG.warning("Cloud specification %s is faulty. BiBiGrid understood %s.", index, cloud_specification) - success = check_clouds_yaml_security() and success + self.log.warning("Cloud specification %s is faulty. BiBiGrid understood %s.", index, + cloud_specification) + success = check_clouds_yaml_security(self.log) and success return success def check_nfs(self): @@ -444,14 +447,14 @@ def check_nfs(self): Checks whether nfsshares => nfs holds and logs if failed. Returns True in every case as it is not fatale. @return: True """ - LOG.info("Checking nfs...") + self.log.info("Checking nfs...") success = True master_configuration = self.configurations[0] nfs_shares = master_configuration.get("nfsShares") nfs = master_configuration.get("nfs") if nfs_shares and not nfs: success = False - LOG.warning("nfsShares exist, but nfs is False.") + self.log.warning("nfsShares exist, but nfs is False.") else: success = True return success diff --git a/bibigrid/models/exceptions.py b/bibigrid/models/exceptions.py index aef5b38c..22e40248 100644 --- a/bibigrid/models/exceptions.py +++ b/bibigrid/models/exceptions.py @@ -15,3 +15,15 @@ class ConfigurationException(Exception): class ConflictException(Exception): """ Conflict exception""" + + +class ImageNotActiveException(Exception): + """ Image deactivated exception""" + + +class ImageDeactivatedException(ImageNotActiveException): + """ Image deactivated exception""" + + +class ImageNotFoundException(ImageNotActiveException): + """ Image not found exception""" diff --git a/bibigrid/openstack/openstack_provider.py b/bibigrid/openstack/openstack_provider.py index d37d57d4..b65b5f62 100644 --- a/bibigrid/openstack/openstack_provider.py +++ b/bibigrid/openstack/openstack_provider.py @@ -15,7 +15,7 @@ from bibigrid.core import provider from bibigrid.core.actions import create from bibigrid.core.actions import version -from bibigrid.models.exceptions import ExecutionException, ConflictException +from bibigrid.models.exceptions import ExecutionException, ConflictException, ImageDeactivatedException LOG = logging.getLogger("bibigrid") @@ -117,6 +117,8 @@ def create_server(self, name, flavor, image, network, key_name=None, wait=True, server = self.conn.create_server(name=name, flavor=flavor, image=image, network=network, key_name=key_name, volumes=volumes, security_groups=security_groups) except openstack.exceptions.BadRequestException as exc: + if "is not active" in str(exc): + raise ImageDeactivatedException() from exc raise ConnectionError() from exc except openstack.exceptions.SDKException as exc: raise ExecutionException() from exc diff --git a/bibigrid_rest.sh b/bibigrid_rest.sh new file mode 100755 index 00000000..856885aa --- /dev/null +++ b/bibigrid_rest.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python3 -m bibigrid.core.startup_rest "$@" \ No newline at end of file diff --git a/documentation/markdown/features/check.md b/documentation/markdown/features/check.md index 4ff4d5db..2f411d5d 100644 --- a/documentation/markdown/features/check.md +++ b/documentation/markdown/features/check.md @@ -2,7 +2,7 @@ ## Exactly one master or vpn instance per configuration -There can only be a single master or a single vpn-gateway per configuration. +There can only be a single master or a single vpn-gateway (vpngtw) per configuration. ## Given Server group exist diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index 9237f4c4..b327f2ba 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -44,6 +44,13 @@ sshPublicKeyFiles: - /home/user/.ssh/id_ecdsa_colleague.pub ``` +#### autoMount (optional) +> **Warning:** If a volume has an obscure filesystem, this might overwrite your data! + +If `True` all [masterMounts](#mastermounts-optional) will be automatically mounted by BiBiGrid if possible. +If a volume is not formatted or has an unknown filesystem, it will be formatted to `ext4`. +Default `False`. + #### masterMounts (optional) `masterMounts` expects a list of volumes and snapshots. Those will be attached to the master. If any snapshots are @@ -131,6 +138,10 @@ After creation connection information is [printed](../features/create.md#prints- If `False`, master will no longer help workers to process jobs. Default is `True`. +#### useMasterWithPublicIP (optional) + +If `False`, master will not be created with an attached floating ip. Default is `True`. + #### waitForServices (optional): Expects a list of services to wait for. @@ -138,6 +149,24 @@ This is required if your provider has any post-launch services interfering with seemingly random errors can occur when the service interrupts ansible's execution. Services are listed on [de.NBI Wiki](https://cloud.denbi.de/wiki/) at `Computer Center Specific` (not yet). +#### +In order to save valuable floating ips, BiBiGrid can also make use of a gateway to create the cluster. +For more information on how to set up a gateway, how gateways work and why they save floating ips please continue reading [here](https://cloud.denbi.de/wiki/Tutorials/SaveFloatingIPs/). + +BiBiGrid needs the gateway-ip and a function that maps ips of nodes behind the gateway (private nodes) to the port over which you can connect to said node over the gateway. + +In the example below the gateway-ip is 123.123.123.42 (ip of the gateway node) and the port function is 30000 + oct4. +Hereby, Oct4 stands for the fourth octet of the private node's ip (the last element). You can use your own custom port function +using all octets if needed.
+A private node with ip "123.123.123.12" is reachable over 123.123.123.42:30012 (because the fourth octet is 12). +```yaml +gateway: + ip: 123.123.123.42 # IP of gateway to use + portFunction: 30000 + oct4 # variables are called: oct1.oct2.oct3.oct4 +``` + +Using gateway also automatically sets [useMasterWithPublicIp](#usemasterwithpublicip-optional) to `False`. + ### Local #### infrastructure (required) @@ -174,6 +203,16 @@ openstack image list --os-cloud=openstack | grep active Currently, images based on Ubuntu 20.04/22.04 (Focal/Jammy) and Debian 11(Bullseye) are supported. +###### Using Regex +Instead of using a specific image you can also provide a regex. +For example if your images are named by following the pattern `Ubuntu 22.04 LTS ($DATE)` and on ly the +most recent release is active, you can use `Ubuntu 22.04 LTS \(.*\)` so it always picks the right one. + +This regex will also be used when starting worker instances on demand +and is therefore mandatory to automatically resolve image updates of the described kind while running a cluster. + +There's also a [Fallback Option](#fallbackonotherimage-optional). + ##### Find your active `type`s `flavor` is just the OpenStack terminology for `type`. @@ -218,7 +257,7 @@ You can create features for the master [in the same way](#features-optional) as ```yaml masterInstance: type: de.NBI tiny - image: Ubuntu 22.04 LTS (2022-10-14) + image: Ubuntu 22.04 LTS (2022-10-14) # regex allowed features: - hasdatabase - holdsinformation @@ -231,9 +270,22 @@ Exactly one in every configuration but the first: ```yaml vpngtw: type: de.NBI tiny - image: Ubuntu 22.04 LTS (2022-10-14) + image: Ubuntu 22.04 LTS (2022-10-14) # regex allowed ``` +### fallbackOnOtherImage (optional) +If set to `true` and an image is not among the active images, +BiBiGrid will try to pick a fallback image for you by finding the closest active image by name that has at least 60% name overlap. +This will not find a good fallback every time. + +You can also set `fallbackOnOtherImage` to a regex like `Ubuntu 22.04 LTS \(.*\)` in which case BiBiGrid will pick an +active image matching that regex. +This can be combined with the regular regex option from the [image key](#find-your-active-images). +In that case the fallback regex should be more open to be still useful when the original regex failed to find an active image. + +This fallback will also be used when starting worker instances on demand +and can be helpful to when image updates occur while running a cluster. + #### sshUser (required) `sshUser` is the standard user of the installed images. For `Ubuntu 22.04` this would be `ubuntu`. @@ -273,7 +325,7 @@ openstack subnet list --os-cloud=openstack #### localDNSLookup (optional) If no full DNS service for started instances is available, set `localDNSLookup: True`. -Currently the case in Berlin, DKFZ, Heidelberg and Tuebingen. +Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen. #### features (optional) diff --git a/requirements-rest.txt b/requirements-rest.txt new file mode 100644 index 00000000..57bc62e8 --- /dev/null +++ b/requirements-rest.txt @@ -0,0 +1,14 @@ +openstacksdk==0.62 +mergedeep +paramiko +python-cinderclient +python-keystoneclient +python-novaclient +python-openstackclient==6.0.0 +PyYAML +shortuuid +sshtunnel +fastapi +python-multipart +uvicorn +httpx \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a5b5e1bf..d21a3828 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,6 @@ python-novaclient python-openstackclient==6.0.0 PyYAML shortuuid -sshtunnel \ No newline at end of file +sshtunnel +sympy +seedir \ No newline at end of file diff --git a/resources/bin/binfo b/resources/bin/binfo new file mode 100644 index 00000000..e7b9555a --- /dev/null +++ b/resources/bin/binfo @@ -0,0 +1,3 @@ +#!/bin/bash +# shows available features +sinfo -o "%20n %20P %10c %10m %25f" \ No newline at end of file diff --git a/resources/playbook/roles/bibigrid/files/slurm/create_server.py b/resources/playbook/roles/bibigrid/files/slurm/create_server.py index db84188d..13216619 100644 --- a/resources/playbook/roles/bibigrid/files/slurm/create_server.py +++ b/resources/playbook/roles/bibigrid/files/slurm/create_server.py @@ -3,9 +3,11 @@ Creates one or more instances from comma separated name list. Is called automatically by create.sh (called by slurm user automatically) which sources a virtual environment. """ +import difflib import logging import math import os +import re import subprocess import sys import threading @@ -17,6 +19,11 @@ import yaml from openstack.exceptions import OpenStackCloudException + +class ImageNotFoundException(Exception): + """ Image not found exception""" + + LOGGER_FORMAT = "%(asctime)s [%(levelname)s] %(message)s" logging.basicConfig(format=LOGGER_FORMAT, filename="/var/log/slurm/create_server.log", level=logging.INFO) HOSTS_FILE_PATH = "/opt/playbook/vars/hosts.yml" @@ -32,6 +39,31 @@ logging.info("Starting instances %s", start_workers) +def select_image(start_worker_group, connection): + image = start_worker_group["image"] + # check if image is active + active_images = [img["name"] for img in connection.image.images() if img["status"].lower() == "active"] + if image not in active_images: + old_image = image + logging.info(f"Image '{old_image}' has no direct match. Maybe it's a regex? Trying regex match.") + image = next((elem for elem in active_images if re.match(image, elem)), None) + if not image: + logging.warning(f"Couldn't find image '{old_image}'.") + if isinstance(start_worker_group.get("fallback_on_other_image"), str): + image = next( + elem for elem in active_images if re.match(start_worker_group["fallback_on_other_image"], elem)) + logging.info(f"Taking first regex ('{start_worker_group['fallback_on_other_image']}') match '{image}'.") + elif start_worker_group.get("fallback_on_other_image", False): + image = difflib.get_close_matches(old_image, active_images)[0] + logging.info(f"Taking closest active image (in name) '{image}' instead.") + else: + raise ImageNotFoundException(f"Image {old_image} no longer active or doesn't exist.") + logging.info(f"Using alternative '{image}' instead of '{old_image}'. You should change the configuration.") + else: + logging.info(f"Taking first regex match: '{image}'") + return image + + def start_server(worker, start_worker_group, start_data): try: logging.info("Create server %s.", worker) @@ -43,8 +75,8 @@ def start_server(worker, start_worker_group, start_data): with open(userdata_file_path, mode="r", encoding="utf-8") as userdata_file: userdata = userdata_file.read() # create server and ... - server = connection.create_server(name=worker, flavor=start_worker_group["flavor"]["name"], - image=start_worker_group["image"], + image = select_image(start_worker_group, connection) + server = connection.create_server(name=worker, flavor=start_worker_group["flavor"]["name"], image=image, network=start_worker_group["network"], key_name=f"tempKey_bibi-{common_config['cluster_id']}", security_groups=[f"default-{common_config['cluster_id']}"], userdata=userdata, @@ -191,13 +223,13 @@ def _run_playbook(cmdline_args): for worker_group in worker_groups: for start_worker in start_workers: # start all servers that are part of the current worker group - result = subprocess.run(["scontrol", "show", "hostname", worker_group["name"]], - stdout=subprocess.PIPE, check=True) # get all workers in worker_type + result = subprocess.run(["scontrol", "show", "hostname", worker_group["name"]], stdout=subprocess.PIPE, + check=True) # get all workers in worker_type possible_workers = result.stdout.decode("utf-8").strip().split("\n") if start_worker in possible_workers: - start_worker_thread = threading.Thread(target=start_server, kwargs={"worker": start_worker, - "start_worker_group": worker_group, - "start_data": server_start_data}) + start_worker_thread = threading.Thread(target=start_server, + kwargs={"worker": start_worker, "start_worker_group": worker_group, + "start_data": server_start_data}) start_worker_thread.start() start_server_threads.append(start_worker_thread) @@ -206,7 +238,7 @@ def _run_playbook(cmdline_args): # If no suitable server can be started: abort if len(server_start_data["available_servers"]) == 0: - logging.warning("No suitable server found! Abort!") + logging.warning("Couldn't make server available! Abort!") sys.exit(1) # run ansible on master node to configure dns diff --git a/resources/playbook/roles/bibigrid/tasks/000-add-ip-routes.yml b/resources/playbook/roles/bibigrid/tasks/000-add-ip-routes.yml index 2df18554..3a1ef202 100644 --- a/resources/playbook/roles/bibigrid/tasks/000-add-ip-routes.yml +++ b/resources/playbook/roles/bibigrid/tasks/000-add-ip-routes.yml @@ -1,15 +1,15 @@ -- name: disable netplan configuration files +- name: Disable netplan configuration files notify: - netplan apply block: - - name: collect files + - name: Collect files find: paths: /etc/netplan/ hidden: true recurse: true file_type: any register: collected_files - - name: copy files + - name: Copy files copy: src: "{{ item.path }}" dest: "{{ item.path }}.disabled" @@ -17,13 +17,13 @@ group: root mode: 0644 with_items: "{{ collected_files.files }}" - - name: remove collected files + - name: Remove collected files file: path: "{{ item.path }}" state: absent with_items: "{{ collected_files.files }}" -- name: disable cloud network changes after initialization +- name: Disable cloud network changes after initialization lineinfile: path: /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg line: "network: {config: disabled}" @@ -32,7 +32,7 @@ mode: 0644 create: true -- name: generate location specific worker userdata +- name: Generate location specific worker userdata template: src: networking/bibigrid_ens3.network.j2 dest: "/etc/systemd/network/bibigrid_ens3.network" @@ -43,7 +43,7 @@ notify: - systemd-networkd restart -- name: generate location specific worker userdata +- name: Generate location specific worker userdata template: src: networking/bibigrid_ens3.link.j2 dest: "/etc/systemd/network/bibigrid_ens3.link" diff --git a/resources/playbook/roles/bibigrid/tasks/003-dns.yml b/resources/playbook/roles/bibigrid/tasks/003-dns.yml index 189ef1a4..6bb075c2 100644 --- a/resources/playbook/roles/bibigrid/tasks/003-dns.yml +++ b/resources/playbook/roles/bibigrid/tasks/003-dns.yml @@ -18,18 +18,21 @@ template: src: dns/hosts.j2 dest: /etc/dnsmasq.hosts + mode: '0644' notify: dnsmasq - name: Adjust dnsmasq.resolv.conf template: src: dns/resolv.conf.j2 dest: /etc/dnsmasq.resolv.conf + mode: '0644' notify: dnsmasq - name: Adjust dnsmasq conf template: src: dns/dnsmasq.conf.j2 dest: /etc/dnsmasq.conf + mode: '0644' notify: dnsmasq - name: Flush handlers diff --git a/resources/playbook/roles/bibigrid/tasks/010-bin-server.yml b/resources/playbook/roles/bibigrid/tasks/010-bin-server.yml index b62b46d2..c6716010 100644 --- a/resources/playbook/roles/bibigrid/tasks/010-bin-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/010-bin-server.yml @@ -20,7 +20,7 @@ path: ~{{ ansible_facts.env.SUDO_USER }}/bin state: absent -- name: generate bibiname script +- name: Generate bibiname script template: src: "bin/bibiname.j2" dest: "/usr/local/bin/bibiname" diff --git a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml index 17bc4b51..3bc06ebf 100644 --- a/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/020-disk-server.yml @@ -16,3 +16,30 @@ with_items: - "{{ master.disks }}" when: master.disks is defined + +- block: + - name: Make sure disks are available + filesystem: + fstype: ext4 + dev: "{{ item.device }}" + force: false + state: present + with_items: "{{ volumes }}" + + - name: Create mount folders if they don't exist + file: + path: "/{{ item.name }}" + state: directory + mode: '0755' + owner: root + group: '{{ ansible_distribution | lower }}' + with_items: "{{ volumes }}" + + - name: Mount disks + mount: + path: "{{ item.name }}" + src: "{{ item.device }}" + state: mounted + with_items: "{{ volumes }}" + when: volumes is defined and auto_mount + ignore_errors: true diff --git a/resources/playbook/roles/bibigrid/tasks/025-nfs-server.yml b/resources/playbook/roles/bibigrid/tasks/025-nfs-server.yml index 32f89491..8f42e022 100644 --- a/resources/playbook/roles/bibigrid/tasks/025-nfs-server.yml +++ b/resources/playbook/roles/bibigrid/tasks/025-nfs-server.yml @@ -20,7 +20,7 @@ regexp: '^{{ item.src }}' line: "{{ item.src }} - {{cluster_cidrs|map(attribute='provider_cidrs')|join('(rw,nohide,insecure,no_subtree_check,async) ')}}\ + {{cluster_cidrs|map(attribute='provider_cidrs')|flatten|join('(rw,nohide,insecure,no_subtree_check,async) ')}}\ (rw,nohide,insecure,no_subtree_check,async) {{ '10.0.0.0/'+wireguard.mask_bits|default(24)|string + '(rw,nohide,insecure,no_subtree_check,async)' if wireguard is defined }}" diff --git a/resources/playbook/roles/bibigrid/tasks/main.yml b/resources/playbook/roles/bibigrid/tasks/main.yml index b9e5ea06..b81dfdbc 100644 --- a/resources/playbook/roles/bibigrid/tasks/main.yml +++ b/resources/playbook/roles/bibigrid/tasks/main.yml @@ -102,9 +102,11 @@ msg: "[BIBIGRID] Generate directory structure available on all hosts" - name: Generate general directory structure available on all hosts import_tasks: 020-disk.yml + tags: ["disk"] - name: Generate server directory structure available on all hosts import_tasks: 020-disk-server.yml when: "'master' in group_names" + tags: ["disk"] - name: Generate worker directory structure available on all hosts import_tasks: 020-disk-worker.yml when: "'master' not in group_names" diff --git a/resources/playbook/roles/bibigrid/templates/bin/bibiname.j2 b/resources/playbook/roles/bibigrid/templates/bin/bibiname.j2 index b3bb5ff3..a7b2434d 100644 --- a/resources/playbook/roles/bibigrid/templates/bin/bibiname.j2 +++ b/resources/playbook/roles/bibigrid/templates/bin/bibiname.j2 @@ -5,8 +5,8 @@ if [ "$1" == "m" ]; then elif [ "$1" == "v" ]; then name="bibigrid-vpngtw-$cid-$2" elif [ "$1" == "w" ]; then - name="bibigrid-worker$2-$cid-$3" + name="bibigrid-worker-$cid-$2" else - name="bibigrid-worker$1-$cid-$2" + name="bibigrid-worker-$cid-$1" fi echo "$name"