diff --git a/dashboard.yaml b/dashboard.yaml index 54ba4fe..31be9f7 100644 --- a/dashboard.yaml +++ b/dashboard.yaml @@ -98,7 +98,7 @@ dashboard: type: asciiGraph # || progressBar || asciiText || markdown || markdown Table ||Table metricUnit: kb # byte_to_kb_mb_gb_tb # dynamic_byte_convert metric: > - topk(20, sum(irate(kafka_server_brokertopicmetrics_bytesin_total{topic=~"$topic"}[5m])) by (strimzi_io_cluster, topic)) / 1024 + sort_desc(sum(irate(kafka_server_brokertopicmetrics_bytesin_total{topic=~"$topic"}[5m])) by (strimzi_io_cluster, topic)) / 1024 custom_key: "🍅 {{topic}}" asciiGraphOptions: height: 0 @@ -116,7 +116,7 @@ dashboard: type: asciiGraph # || progressBarList || asciiText metricUnit: kb metric: > - topk(20, sum(irate(kafka_server_brokertopicmetrics_bytesout_total{topic=~"$topic"}[5m])) by (strimzi_io_cluster, topic)) / 1024 + sort_desc(sum(irate(kafka_server_brokertopicmetrics_bytesout_total{topic=~"$topic"}[5m])) by (strimzi_io_cluster, topic)) / 1024 custom_key: "🥕 {{topic}}" asciiGraphOptions: height: 0 diff --git a/kubePtop/ascii_graph.py b/kubePtop/ascii_graph.py index dd596fb..bb4cd0e 100644 --- a/kubePtop/ascii_graph.py +++ b/kubePtop/ascii_graph.py @@ -312,7 +312,6 @@ def create_graph(self, names=[], height=17, width=45, max_height=20, max_width=5 else: self.width = width - self.names = names # if len(names) > len(self.colors): diff --git a/kubePtop/cli.py b/kubePtop/cli.py deleted file mode 100644 index f3fd12c..0000000 --- a/kubePtop/cli.py +++ /dev/null @@ -1,223 +0,0 @@ -from kubePtop.read_env import ReadEnv -# Read environment variables -read_environment_variables = ReadEnv() -read_environment_variables.read_env() -import argparse -from kubePtop.session import PrometheusAPI -from kubePtop.node_monitor import Node_Monitoring -from kubePtop.pod_monitor import Pod_Monitoring -from kubePtop.node_metrics import PrometheusNodeMetrics -from kubePtop.pod_metrics import PrometheusPodsMetrics -import rich - -node_monitor = Node_Monitoring() -pod_monitor = Pod_Monitoring() -pod_metrics = PrometheusPodsMetrics() -node_metrics = PrometheusNodeMetrics() -prometheus_api = PrometheusAPI() -from kubePtop.logging import Logging -from kubePtop.global_attrs import GlobalAttrs - - - -class Cli(): - def __init__(self): - self.parser = None - # CLI Input attributes - # self.verify_prometheus = False - self.list_pvcs = False - self.list_nodes = False - self.node = None - self.list_pods = False - self.pod = None - self.container = None - self.namespace = "default" - self.all_namespaces = False - self.debug = False - self.dashboard = 'default' - self.list_dashboards = False - self.sort_by_mem_usage = False - self.list_option = '' - self.list_nodes_option = [] - self.colorize_json = False - self.dashboard = None - - - # Read CLI arguments - self.argparse() - - - if self.debug: - GlobalAttrs.debug = True - Logging.log.setLevel(level="DEBUG") - - if self.dashboard: - rich.print("Testing - Custom Dashboard") - exit(1) - - - - # kptop nodes - if self.node: - if self.list_dashboards: - node_monitor.list_dashboards() - exit(0) - - # kptop nodes -o json - if self.list_option == 'json': - node_metrics.topNodeJson(node=self.node, color=self.colorize_json) - exit(0) - # Check if the node found. - node_monitor.display_dashboard(dashboard=self.dashboard, node_name=self.node) - - # kptop nodes - if self.list_nodes: - # kptop nodes -o json - if self.list_option == 'json': - node_metrics.topNodeJson(node=".*", color=self.colorize_json) - exit(0) - node_metrics.topNodeTable(option=self.list_option) - exit(0) - - - # kptop pods - if self.pod: - if self.container is None: - self.container = ".*" - # Check if the pod found. - check_pod = pod_metrics.podExists(pod=self.pod, namespace=self.namespace) - if not check_pod.get('result'): - print(f"pod/{self.pod} not found in the '{self.namespace}' namespace") - rich.print(f"[yellow]{check_pod.get('fail_reason')}") - exit(1) - pod_monitor.pod_monitor(pod=self.pod, namespace=self.namespace, container=self.container) - - if self.list_pods: - # kptop pods - ns = self.namespace - if self.all_namespaces: - ns = ".*" - pod_metrics.topPodTable(namespace=ns,sort_by_mem_usage=self.sort_by_mem_usage) - exit(0) - - if self.list_pvcs: - # kptop pods - ns = self.namespace - if self.all_namespaces: - ns = ".*" - pod_metrics.topPvcTable(namespace=ns) - exit(0) - - # Print help if no args are provided. - # self.parser.print_help() - - def argparse(self): - parser = argparse.ArgumentParser(description='A Python tool for Kubernetes Nodes/Pods terminal monitoring through Prometheus metrics.') - parser.add_argument('top', type=str, nargs='*', metavar='{pods, pod, po} | {nodes, node} | {persistentvolumeclaim, pvc}', help='top pods/nodes/persistentvolumeclaim') - parser.add_argument('-n', '--namespace', type=str, required=False, metavar='', help='Specify a Kubernetes namespace') - parser.add_argument('-A', '--all-namespaces', required=False, action='store_true', help='All Kubernetes namespaces') - parser.add_argument('-c', '--container', type=str, required=False, metavar='', help='Monitor a specific Pod\'s container') - parser.add_argument('-i', '--interval', type=int, required=False, metavar='', help='Live monitoring update interval') - parser.add_argument('-V', '--verify-prometheus', required=False, action='store_true', help='Verify Prometheus connection & exporters') - parser.add_argument('-C', '--check-metrics', required=False, action='store_true', help='Checks the availability of the needed metrics') - parser.add_argument('-d', '--debug', required=False, action='store_true', help='Print debug output') - parser.add_argument('-s', '--sort-by-mem-usage', required=False, action='store_true', help='Sort top result by memory usage') - parser.add_argument('-o', '--option', type=str, required=False, choices=['cloud', 'json'], help='options for "kptop node||pod" (currently supported in "kptop node")') - parser.add_argument('-cj', '--colorize-json', required=False, action='store_true', help='Colorize Json output (with "-o json")') - parser.add_argument('-D', '--dashboard', type=str, required=False, metavar='', help='Dashboard name to visualize') - # parser.add_argument('-q', '--query', type=str, required=False, help='options for "Run a custom query') - - # parser.add_argument('-D', '--dashboard', type=str, required=False, metavar='', help='Specify a dashboard') - # parser.add_argument('-L', '--list-dashboards', required=False, action='store_true', help='List available dashboards') - - pod_aliases = ['pod', 'pods', 'po'] - node_aliases = ['node', 'nodes'] - pvc_aliases = ['pvc', 'persistentvolumeclaim'] - - results = parser.parse_args() - self.parser = parser - - if results.debug: - self.debug = True - - ### kptop --verify-prometheus - if results.verify_prometheus: - prometheus_api.verify_exporters() - if results.check_metrics: - prometheus_api.check_metrics() - exit(0) - - - if len(results.top) == 0: - self.parser.print_help() - exit(1) - - ### kptop pods | nodes | pvcs - if len(results.top) == 1: - if results.top[0] in pod_aliases: - self.list_pods = True - elif results.top[0] in node_aliases: - self.list_nodes = True - elif results.top[0] in pvc_aliases: - self.list_pvcs = True - else: - rich.print(f"[bold]ERROR -- unkown argument '{results.top[0]}'\n") - self.parser.print_help() - exit(1) - - ### Example: kptop pods - if len(results.top) == 2: - if results.top[0] in pod_aliases: - self.pod = results.top[1] - elif results.top[0] in node_aliases: - self.node = results.top[1] - else: - rich.print(f"[bold]ERROR -- unkown argument '{results.top[0]}'\n") - self.parser.print_help() - exit(1) - if len(results.top) > 2: - rich.print(f"[bold]ERROR -- unkown argument '{results.top[2]}' - only 2 arguments are expected\n") - self.parser.print_help() - exit(1) - - if results.namespace and results.all_namespaces: - rich.print("[bold]ERROR -- You can only use '--all-namespaces' or '--namespace' \n") - self.parser.print_help() - exit(1) - - if results.namespace: - self.namespace = results.namespace - - if results.option: - self.list_option = results.option - - if results.colorize_json: - self.colorize_json = results.option - - if results.all_namespaces: - self.all_namespaces = results.all_namespaces - - if results.container: - self.container = results.container - - if results.interval: - GlobalAttrs.live_update_interval = results.interval - - if results.dashboard: - self.dashboard = results.dashboard - - # if results.list_dashboards: - # self.list_dashboards = True - - # if results.dashboard: - # self.dashboard = results.dashboard - - if results.sort_by_mem_usage: - self.sort_by_mem_usage = True - - - -cli = Cli() - -# def run(): -# cli = Cli() diff --git a/kubePtop/cli_args.py b/kubePtop/cli_args.py index f29a602..d2bbaa6 100644 --- a/kubePtop/cli_args.py +++ b/kubePtop/cli_args.py @@ -3,6 +3,10 @@ import rich import os import logging +from datetime import datetime +from pathlib import Path +from tabulate import tabulate +from kubePtop.global_attrs import GlobalAttrs from kubePtop.dashboard_monitor import customDashboardMonitoring from kubePtop.command_run import commandRun from kubePtop.dashboard_yaml_loader import dashboardYamlLoader @@ -35,6 +39,26 @@ def __init__(self): "description": "command name to display" } }, + { + "name": "list-dashboards", + "default": ".*", + "cliArgument": { + "enable": True, + "short": "-ld", + "required": False, + "description": "List dasboards names" + } + }, + { + "name": "list-commands", + "default": ".*", + "cliArgument": { + "enable": True, + "short": "-lc", + "required": False, + "description": "List commands names" + } + }, { "name": "vhelp", "default": ".*", @@ -49,12 +73,11 @@ def __init__(self): self.variables = {} self.build_variables() - def build_parser(self, variables): parser = argparse.ArgumentParser(description='Process some CLI arguments.') for var in variables: if var['cliArgument']['enable']: - if var['name'] == 'vhelp': + if var['name'] in ['vhelp', 'list-dashboards', 'list-commands']: parser.add_argument( f"--{var['name']}", var['cliArgument']['short'], @@ -73,6 +96,65 @@ def build_parser(self, variables): return parser + def _list_files_in_directory(self, directory_path): + out = { + "success": False, + "data": None, + "fail_reason": "" + } + try: + p = Path(directory_path) + files_info = [ + { + "name": file.stem, + "creation_date": datetime.fromtimestamp(file.stat().st_ctime).strftime('%d-%m-%Y %H:%M:%S'), + "modification_date": datetime.fromtimestamp(file.stat().st_mtime).strftime('%d-%m-%Y %H:%M:%S') + } + for file in p.iterdir() if file.is_file() and file.suffix in ['.yaml', '.yml'] + ] + if len(files_info) < 1: + out['fail_reason'] = f"No files found in '{directory_path}'" + return out + + out['data'] = files_info + out["success"] = True + return out + except FileNotFoundError as e: + out["fail_reason"] = f"Directory {directory_path} not found > {e}" + return out + + def _load_file_content(self, directory_path, file_name): + out = { + "success": False, + "data": None, + "fail_reason": "" + } + yaml_extensions = ['.yaml', '.yml'] + + file_path = None + for ext in yaml_extensions: + try: + potential_path = Path(directory_path) / (file_name + ext) + if potential_path.exists(): + file_path = potential_path + break + except Exception as e: + out["fail_reason"] = e + return out + + if not file_path: + out["fail_reason"] = f"File '{file_name}'.yml||yaml is NOT Found in {directory_path}" + return out + + try: + with open(file_path, 'r') as file: + out["data"] = file.read() + out["success"] = True + except Exception as e: + out["fail_reason"] = f"File is NOT Found" + return out + + def build_variables(self): initial_parser = self.build_parser(self.default_cli_args) # rich.print(initial_parser) @@ -83,12 +165,62 @@ def build_variables(self): initial_parser.print_help() exit(1) + if initial_args.list_dashboards and initial_args.list_commands: + rich.print("\n[yellow bold]Can NOT specify '--list-dashboards' & '--list-commands' together\n") + initial_parser.print_help() + exit(1) + + + ################### + # List Dashboards # + ################### + if initial_args.list_dashboards: + check = self._list_files_in_directory(GlobalAttrs.default_dashboards_dir) + if not check['success']: + rich.print(f"Could NOT list dashboards.\n[yellow]{check['fail_reason']}\n") + exit(1) + + table = [['DASHBOARD', 'CREATION TIME', 'UPDATE TIME']] + + for file in check['data']: + row = [file['name'], file['creation_date'], file['modification_date']] + table.append(row) + out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=False) + print(out) + print() + exit(0) + + ################# + # List Commands # + ################# + if initial_args.list_commands: + check = self._list_files_in_directory(GlobalAttrs.default_commands_dir) + if not check['success']: + rich.print(f"Could NOT list commands.\n[yellow]{check['fail_reason']}\n") + exit(1) + + table = [['COMMAND', 'CREATION TIME', 'UPDATE TIME']] + + for file in check['data']: + row = [file['name'], file['creation_date'], file['modification_date']] + table.append(row) + out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=False) + print(out) + print() + exit(0) + ################## # Load Dashboard # ################## elif initial_args.dashboard: - # rich.print(dashboard_yaml_loader.validate_dashboard_schema('d')) - parsed_dashboard = dashboard_yaml_loader.load_dashboard_data(dashboard_name="./dashboard.yaml") + # Load dashboard yaml file + check = self._load_file_content(GlobalAttrs.default_dashboards_dir, initial_args.dashboard) + if not check['success']: + rich.print(f"Dashboard is NOT found\n[yellow]{check['fail_reason']}\n") + exit(1) + + # Parse and validate the dashboard yaml file + parsed_dashboard = dashboard_yaml_loader.load_dashboard_data(command_content_content=check['data']) if not parsed_dashboard['success']: logging.error(f"Failed to load dashboard: '{initial_args.dashboard}'") @@ -125,10 +257,14 @@ def build_variables(self): # Load Command # ################ elif initial_args.command: - parsed_command = command_yaml_loader.load_command_data(command_name="./command.yaml") - # rich.print(parsed_command) - # exit(1) + # Load command yaml file + check = self._load_file_content(GlobalAttrs.default_commands_dir, initial_args.command) + if not check['success']: + rich.print(f"Command is NOT found\n[yellow]{check['fail_reason']}\n") + exit(1) + # Parse and validate the command yaml file + parsed_command = command_yaml_loader.load_command_data(command_content_content=check['data']) if not parsed_command['success']: logging.error(f"Failed to load command: '{initial_args.command}'") logging.error(parsed_command['fail_reason']) diff --git a/kubePtop/colors.py b/kubePtop/colors.py index e625f19..0292c3d 100644 --- a/kubePtop/colors.py +++ b/kubePtop/colors.py @@ -1,4 +1,3 @@ - class Bcolors: def __init__(self): self.HEADER = '\033[95m' @@ -10,4 +9,4 @@ def __init__(self): self.BOLD = '\033[1m' self.UNDERLINE = '\033[4m' self.GRAY = "\033[1;30;40m" - self.GRAY = "\033[90m" \ No newline at end of file + self.GRAY = "\033[90m" diff --git a/kubePtop/command_run.py b/kubePtop/command_run.py index 4a3874f..d180ce0 100644 --- a/kubePtop/command_run.py +++ b/kubePtop/command_run.py @@ -4,14 +4,14 @@ from tabulate import tabulate import logging from kubePtop.global_attrs import GlobalAttrs -from kubePtop.node_metrics import PrometheusNodeMetrics +from kubePtop.session import PrometheusAPI # from kubePtop.ascii_graph import AsciiGraph from kubePtop.helper import Helper helper_ = Helper() from kubePtop.logging import Logging -class commandRun(PrometheusNodeMetrics): +class commandRun(PrometheusAPI): def __init__(self): super().__init__() self.layout_list = [] diff --git a/kubePtop/command_yaml_loader.py b/kubePtop/command_yaml_loader.py index e3f7a39..8417dd3 100644 --- a/kubePtop/command_yaml_loader.py +++ b/kubePtop/command_yaml_loader.py @@ -551,29 +551,27 @@ def validate_command_schema(self, dashboard_yaml_data): exit(1) - def load_command_data(self, command_name): + def load_command_data(self, command_content_content): out = { "success": False, "data": None, "fail_reason": "" } - # Check if the yaml file exists in the command directory - ## If so, return the file path - ### The command dir is taken as ENV - yaml_file = command_name - # Check if the file does NOT exist - if not os.path.isfile(yaml_file): - out['fail_reason'] = f"Command File '{yaml_file}' does NOT exist" - return out + # # Check if the yaml file exists in the command directory + # ## If so, return the file path + # ### The command dir is taken as ENV + # yaml_file = command_name + # # Check if the file does NOT exist + # if not os.path.isfile(yaml_file): + # out['fail_reason'] = f"Command File '{yaml_file}' does NOT exist" + # return out # Read the file try: - with open(yaml_file, 'r') as file: - content = file.read() - out['data'] = yaml.safe_load(content) + out['data'] = yaml.safe_load(command_content_content) except Exception as e: - out['fail_reason'] = f"Failed to open the command file '{yaml_file}' > {e}" + out['fail_reason'] = f"Failed to parse the command file content > {e}" return out # Yaml Schema validation diff --git a/kubePtop/dashboard_monitor.py b/kubePtop/dashboard_monitor.py index d57056e..ec3391f 100644 --- a/kubePtop/dashboard_monitor.py +++ b/kubePtop/dashboard_monitor.py @@ -24,14 +24,14 @@ from kubePtop.global_attrs import GlobalAttrs -from kubePtop.node_metrics import PrometheusNodeMetrics +from kubePtop.session import PrometheusAPI from kubePtop.ascii_graph import AsciiGraph from kubePtop.helper import Helper helper_ = Helper() from kubePtop.logging import Logging -class customDashboardMonitoring(PrometheusNodeMetrics): +class customDashboardMonitoring(PrometheusAPI): def __init__(self): super().__init__() self.layout_list = [] diff --git a/kubePtop/dashboard_yaml_loader.py b/kubePtop/dashboard_yaml_loader.py index 229db1e..29f8a15 100644 --- a/kubePtop/dashboard_yaml_loader.py +++ b/kubePtop/dashboard_yaml_loader.py @@ -555,29 +555,31 @@ def validate_dashboard_schema(self, dashboard_yaml_data): exit(1) - def load_dashboard_data(self, dashboard_name): + def load_dashboard_data(self, command_content_content): out = { "success": False, "data": None, "fail_reason": "" } + + # Check if the yaml file exists in the dashboards directory ## If so, return the file path ### The dashboard dir is taken as ENV - yaml_file = dashboard_name - # Check if the file does NOT exist - if not os.path.isfile(yaml_file): - out['fail_reason'] = f"Dashboard File '{yaml_file}' does NOT exist" - return out + # yaml_file = dashboard_name + # # Check if the file does NOT exist + # if not os.path.isfile(yaml_file): + # out['fail_reason'] = f"Dashboard File '{yaml_file}' does NOT exist" + # return out # Read the file try: - with open(yaml_file, 'r') as file: - content = file.read() - out['data'] = yaml.safe_load(content) + # with open(yaml_file, 'r') as file: + # content = file.read() + out['data'] = yaml.safe_load(command_content_content) except Exception as e: - out['fail_reason'] = f"Failed to open the dashboard file '{yaml_file}' > {e}" + out['fail_reason'] = f"Failed to open the dashboard file content > {e}" return out # Yaml Schema validation diff --git a/kubePtop/global_attrs.py b/kubePtop/global_attrs.py index a72e0e4..2f5865c 100644 --- a/kubePtop/global_attrs.py +++ b/kubePtop/global_attrs.py @@ -28,6 +28,9 @@ class GlobalAttrs: start_graphs_with_zero = True graphs_width = 45 + default_dashboards_dir = "/var/kptop/dashboards" + default_commands_dir = "/var/kptop/commands" + debug = False def __init__(self): diff --git a/kubePtop/node_metrics.py b/kubePtop/node_metrics.py deleted file mode 100644 index 79b177d..0000000 --- a/kubePtop/node_metrics.py +++ /dev/null @@ -1,1548 +0,0 @@ -from kubePtop.session import PrometheusAPI -from kubePtop.global_attrs import GlobalAttrs -from kubePtop.logging import Logging -from kubePtop.helper import Helper -from tabulate import tabulate -from kubePtop.colors import Bcolors -import json -import rich -bcolors = Bcolors() -import traceback - - - -helper_ = Helper() - -class PrometheusNodeMetrics(PrometheusAPI): - - def __init__(self): - super().__init__() - - - def nodeMetrics(self, node): - """ - Return Node metrics - INPUT: - - Node name (str) - Return: Node metrics (dct) - """ - output = {} - - output['cpu'] = { - 'cpuLoadAvg1m': self.cpuLoadAvg1m(node), - 'cpuLoadAvg5m': self.cpuLoadAvg5m(node), - 'cpuLoadAvg15m': self.cpuLoadAvg15m(node), - 'cpuUsageAVG': self.cpuUsageAVG(node) - } - - output['memory'] = { - 'MemFreeBytes': self.MemFreeBytes(node), - 'MemAvailableBytes': self.MemAvailableBytes(node), - 'MemTotalBytes': self.MemTotalBytes(node), - 'MemCachedBytes': self.MemCachedBytes(node), - 'MemBuffersBytes': self.MemBuffersBytes(node), - 'MemSwapTotalBytes': self.MemSwapTotalBytes(node), - 'MemSwapFreeBytes': self.MemSwapFreeBytes(node), - 'MemSwapCachedBytes': self.MemSwapCachedBytes(node) - } - - output['disk'] = {} - - output['fs'] = { - 'nodeFsSize': self.nodeFsSize(node), - 'nodeFsUsed': self.nodeFsUsed(node), - 'nodeFsAvailable': self.nodeFsAvailable(node), - } - - - return output - - ### TO BE CLEANED ### - # def node_type(self, node="master"): - # """ - # Detects node type ie. master/worker - # INPUT: - # - K8s node name (str) - # Return: - # - node type eg. "worker" (str) - # """ - # all_nodes = self.list_nodes_names() - # if node not in all_nodes.get('result'): - # raise SystemExit(f"ERROR -- Node '{node}' is not found") - - # role = "worker" - # worker_nodes = self.run_query(f'node_memory_MemTotal_bytes{{node="{node}"}}') - # if len(worker_nodes.get('data').get('result')) < 1: - # role = "master" - - # return role - - - ### NOT USED ### - def list_nodes_names(self, node, devices_filter="eth.*"): - """ - Returns Nodes names: - INPUT: - - K8s node name (str) - Return: - - List of nodes names (lst) - """ - output = { - "success": False, - "fail_reason": "", - "result": [] - } - try: - result = self.run_query('machine_memory_bytes') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - for node in result.get('data').get('result'): - output['result'].append(node.get('metric').get('kubernetes_io_hostname')) - output['success'] = True - - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def MemFreeBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_MemFree_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemAvailableBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_MemAvailable_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemTotalBytes(self, node): - """ - Returns node total memory (worker nodes only) - INPUT: - - K8s node name (str) - Return: - - metric (str) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_MemTotal_bytes{{{GlobalAttrs.node_exporter_node_label}=~"{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemCachedBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_Cached_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemBuffersBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_Buffers_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemSwapTotalBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_SwapTotal_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemSwapFreeBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_SwapFree_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def MemSwapCachedBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_memory_SwapCached_bytes{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def cpuUsageAVG(self, node, avg_time="10m"): - """ - Return cpu info - INPUT: - - k8s node name (str) - Return: - - dct of dcts (nested dct for each core) (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - # Returns cpu usage percentage. - result = self.run_query(f'100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{{mode="idle", {GlobalAttrs.node_exporter_node_label}="{node}"}}[{avg_time}])) * 100 )') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = float(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def cpuLoadAvg1m(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_load1{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = result.get('data').get('result')[0].get('value')[1] - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def cpuLoadAvg5m(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_load5{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = result.get('data').get('result')[0].get('value')[1] - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def cpuLoadAvg15m(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_load15{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = result.get('data').get('result')[0].get('value')[1] - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - ### NOT Used - # def cpuCores(self, node): - # """ - # not in use at the moment - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": "" - # } - # try: - # result = self.run_query(f'kube_node_status_capacity_cpu_cores{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - # if not result.get('status') == 'success': - # output['fail_reason'] = "could not get metric value" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = "metric did not return any data" - # return output - - # output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - # return output - - def cpu_physical_cores(self, node): - """ - not in use at the moment - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'machine_cpu_physical_cores{{kubernetes_io_hostname="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def cpu_sockets(self, node): - """ - not in use at the moment - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'machine_cpu_sockets{{kubernetes_io_hostname="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - ### NOT USED ### - # def nodeOSMetrics(self, node): - # """ - # not in use at the moment - # """ - # output = {} - # output['nodeUp'] = self.nodeUp(node) - # output['bootTimeSeconds'] = self.bootTimeSeconds(node) - # # output['osInfo'] = self.osInfo(node_label, node) - # # output['osVersion'] = self.osVersion(node_label, node) - # output['unameInfo'] = self.unameInfo(node) - # # output['kubeNodeInfo'] = self.kubeNodeInfo(node_label, node) - # output['nodeExporterBuildInfo'] = self.nodeExporterBuildInfo(node) - - # return output - - def nodeUp(self, node): - """ - not in use at the moment - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'up{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - if int(result.get('data').get('result')[0].get('value')[1]) == 1: - output['result'] = True - else: - output['result'] = False - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def bootTimeSeconds(self, node): - """ - not in use at the moment - """ - output = { - "success": False, - "fail_reason": "", - "result": [] - } - try: - result = self.run_query(f'node_boot_time_seconds{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - ### NOT USED ### - # def osInfo(self, node): - # """ - # not in use at the moment - # may be skipped. - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": {} - # } - # try: - # result = self.run_query(f'node_os_info{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - # if not result.get('status') == 'success': - # output['fail_reason'] = "could not get metric value" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = "metric did not return any data" - # return output - - # output['result'] = { - # "pretty_name": result.get('data').get('result')[0].get('metric').get('pretty_name'), - # "version": result.get('data').get('result')[0].get('metric').get('version'), - # "version_codename": result.get('data').get('result')[0].get('metric').get('version_codename'), - # "version_id": result.get('data').get('result')[0].get('metric').get('version_id'), - # } - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - - # return output - - # def osVersion(self, node): - # """ - # not in use at the moment - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": {} - # } - # try: - # result = self.run_query(f'node_os_version{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - # if not result.get('status') == 'success': - # output['fail_reason'] = "could not get metric value" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = "metric did not return any data" - # return output - - # output['result'] = { - # "id": result.get('data').get('result')[0].get('metric').get('id'), - # "id_like": result.get('data').get('result')[0].get('metric').get('id_like'), - # } - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - - # return output - - # def unameInfo(self, node): - # """ - # not in use at the moment - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": {} - # } - # try: - # result = self.run_query(f'node_uname_info{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - # if not result.get('status') == 'success': - # output['fail_reason'] = "could not get metric value" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = "metric did not return any data" - # return output - - # output['result'] = { - # "sysname": result.get('data').get('result')[0].get('metric').get('sysname'), - # "release": result.get('data').get('result')[0].get('metric').get('release'), - # "nodename": result.get('data').get('result')[0].get('metric').get('nodename'), - # "machine": result.get('data').get('result')[0].get('metric').get('machine'), - # "version": result.get('data').get('result')[0].get('metric').get('version'), - # } - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - - # return output - - # def kubeNodeInfo(self, node): - # """ - # not in use at the moment - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": {} - # } - # try: - # result = self.run_query(f'kube_node_info{{{GlobalAttrs.node_exporter_node_label}="{node}"}}') - # if not result.get('status') == 'success': - # output['fail_reason'] = "could not get metric value" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = "metric did not return any data" - # return output - - # output['result'] = { - # "container_runtime_version": result.get('data').get('result')[0].get('metric').get('container_runtime_version'), - # "internal_ip": result.get('data').get('result')[0].get('metric').get('internal_ip'), - # "kernel_version": result.get('data').get('result')[0].get('metric').get('kernel_version'), - # "kubelet_version": result.get('data').get('result')[0].get('metric').get('kubelet_version'), - # "kubeproxy_version": result.get('data').get('result')[0].get('metric').get('kubeproxy_version'), - # "os_image": result.get('data').get('result')[0].get('metric').get('os_image'), - # } - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - - # return output - - # def nodeExporterBuildInfo(self, node): - # """ - # not in use at the moment - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": {} - # } - # try: - # query = f'node_exporter_build_info{{{GlobalAttrs.node_exporter_node_label}="{node}"}}' - # result = self.run_query(query) - # if not result.get('status') == 'success': - # output['fail_reason'] = f"could not get metric value:\n {query}" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = f"Query did not return any data:\n {query}" - # return output - - # output['result'] = { - # "version": result.get('data').get('result')[0].get('metric').get('version'), - # "revision": result.get('data').get('result')[0].get('metric').get('revision'), - # "goversion": result.get('data').get('result')[0].get('metric').get('goversion'), - # } - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - - # return output - - def nodeFsSize(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_filesystem_size_bytes{{mountpoint="/",fstype!="rootfs",{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def nodeFsAvailable(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_filesystem_avail_bytes{{mountpoint="/",fstype!="rootfs",{GlobalAttrs.node_exporter_node_label}="{node}"}}') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def nodeFsUsed(self, node): - """ - Returns Node Used Filesystem in bytes - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - result = self.run_query(f'node_filesystem_size_bytes{{mountpoint="/",fstype!="rootfs",{GlobalAttrs.node_exporter_node_label}="{node}"}} - node_filesystem_avail_bytes') - if not result.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def PodMemUsage(self, node, sort_desc=False): - """ - Return Pod memory usage in bytes (running on the node) - INPUT: - - K8s node name (str) - - sort_desc: (bool) sort with Pods memory usage - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - - try: - query = f'sum(container_spec_memory_limit_bytes{{container!="", {GlobalAttrs.kubernetes_exporter_node_label}="{node}"}}) by (pod, instance, namespace)' - memory_limit = self.run_query(query) - # memory_max_usage = self.run_query('sum(container_memory_max_usage_bytes{container!="", instance="ip-192-168-104-139.me-south-1.compute.internal"}) by (pod, instance, namespace)') - # memory_cache = self.run_query('sum(container_memory_cache{container!="", instance="ip-192-168-104-139.me-south-1.compute.internal"}) by (pod, instance, namespace)') - - if sort_desc: - memory_usage = self.run_query(f'sort_desc(sum(container_memory_working_set_bytes{{container!="", {GlobalAttrs.kubernetes_exporter_node_label}="{node}"}}) by (pod, instance, namespace))') - else: - memory_usage = self.run_query(f'sum(container_memory_working_set_bytes{{container!="", {GlobalAttrs.kubernetes_exporter_node_label}="{node}"}}) by (pod, instance, namespace)') - - if not memory_usage.get('status') == 'success': - output['fail_reason'] = f"could not get metric value: \n{query}" - return output - - if not memory_usage.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - dct = {} - if len(memory_usage.get('data').get('result')) > 0 and (len(memory_limit.get('data').get('result'))) > 0: - for pod_mem_usage in memory_usage.get('data').get('result'): - dct[pod_mem_usage.get('metric').get('pod')] = { - "namespace": pod_mem_usage.get('metric').get('namespace'), - "instance": pod_mem_usage.get('metric').get('instance'), - "memory_usage": int(pod_mem_usage.get('value')[1]), - "memory_limit": 0 - } - for pod_mem_limit in memory_limit.get('data').get('result'): - dct[pod_mem_limit.get('metric').get('pod')]["memory_limit"] = int(pod_mem_limit.get('value')[1]) - - output['result'] = dct - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def PodCpuUsageAvg(self, node, avg="10m"): - """ - Returns Pod CPU usgae average - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - pods_cpu_avg = self.run_query(f'sum(irate(container_cpu_usage_seconds_total{{pod!="", {GlobalAttrs.kubernetes_exporter_node_label}="{node}"}}[{avg}])) by (pod, namespace, instance)') - node_cores = self.run_query(f'sum (machine_cpu_cores{{instance="{GlobalAttrs.kubernetes_exporter_node_label}"}})') - - if not pods_cpu_avg.get('status') == 'success': - output['fail_reason'] = "could not get metric value" - return output - - if not pods_cpu_avg.get('data').get('result'): - output['fail_reason'] = "metric did not return any data" - return output - - dct = {} - if len(pods_cpu_avg.get('data').get('result')) > 0: - cpu_cores_n = node_cores.get('data').get('result')[0].get('value')[1] - for pod in pods_cpu_avg.get('data').get('result'): - dct[pod.get('metric').get('pod')] = { - "node": pod.get('metric').get('instance'), - "namespace": pod.get('metric').get('namespace'), - "namespace": pod.get('metric').get('namespace'), - "cpu_usage_avg": (float(pod.get('value')[1])) // int(cpu_cores_n) * 100, - } - - output['success'] = True - output['result'] = dct - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def PodMemTopUsage(self, node): - """ - Print a table with Top Pods with memory usage (default: top 10 pods) - INPUT: - - K8s node name (str) - OUTPUT: prints a text table. - RETURN: No Return - """ - table = [['POD', "NAMESPACE", 'MEMORY_USAGE', 'PERCENTAGE', 'MEMORY_LIMIT']] - - pods_metircs = self.PodMemUsage(node, sort_desc=True) - - if not pods_metircs.get('success'): - return pods_metircs.get('fail_reason') - - for pod, metrics in pods_metircs.get('result').items(): - if metrics.get('memory_limit') == 0: - memory_limit = "--" - else: - memory_limit = helper_.bytes_to_kb_mb_gb(metrics.get('memory_limit')) - - if metrics.get('memory_limit') != 0: - memory_usage_percentage = str(int(100 * (metrics.get('memory_usage') / metrics.get('memory_limit')))) + "%" - else: - memory_usage_percentage = "--" - - row = [pod, metrics.get('namespace'), helper_.bytes_to_kb_mb_gb(metrics.get('memory_usage')), memory_usage_percentage, memory_limit] - table.append(row) - - out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=True) - return out - - def nodeNetworkReceiveBytes(self, node, devices_to_ingore="tap.*|veth.*|br.*|docker.*|virbr*|lo*|eni.*"): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(node_network_receive_bytes_total{{{GlobalAttrs.node_exporter_node_label}=~"{node}",device!~"{devices_to_ingore}"}}[10m])) by ({GlobalAttrs.node_exporter_node_label}, device)' - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - devices = {} - for device in result.get('data').get('result'): - devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - output['result'] = devices - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def nodeNetworkTransmitBytes(self, node, devices_to_ingore="tap.*|veth.*|br.*|docker.*|virbr*|lo*|eni.*"): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(node_network_transmit_bytes_total{{{GlobalAttrs.node_exporter_node_label}=~"{node}",device!~"{devices_to_ingore}"}}[10m])) by ({GlobalAttrs.node_exporter_node_label}, device)' - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - devices = {} - for device in result.get('data').get('result'): - devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - output['result'] = devices - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - - def nodeDiskWrittenBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(node_disk_written_bytes_total{{{GlobalAttrs.node_exporter_node_label}=~"{node}"}}[10m])) by ({GlobalAttrs.node_exporter_node_label}, device)' - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - devices = {} - for device in result.get('data').get('result'): - devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - output['result'] = devices - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - - def nodeDiskReadBytes(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(node_disk_read_bytes_total{{{GlobalAttrs.node_exporter_node_label}=~"{node}"}}[10m])) by ({GlobalAttrs.node_exporter_node_label}, device)' - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - devices = {} - for device in result.get('data').get('result'): - devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - output['result'] = devices - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - ### NOT USED ### - # def nodeCheck(self, node): - # """ - # Check if the node is available - # INPUT: - # - K8s node name (str) - # Return: - # - metric (dct) - # """ - # output = { - # "success": False, - # "fail_reason": "", - # "result": "" - # } - # try: - # query = f'sum(irate(node_disk_read_bytes_total{{{GlobalAttrs.node_exporter_node_label}=~"{node}"}}[10m])) by ({GlobalAttrs.node_exporter_node_label}, device)' - # result = self.run_query(query) - # if not result.get('status') == 'success': - # output['fail_reason'] = f"could not get metric's value: \n{query}" - # return output - - # if not result.get('data').get('result'): - # output['fail_reason'] = f"Query did not return any data: \n{query}" - # return output - - # devices = {} - # for device in result.get('data').get('result'): - # devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - # output['result'] = devices - # output['success'] = True - - # except(KeyError, AttributeError) as e: - # output['success']: False - # output['fail_reason'] = e - # Logging.log.error(e) - # Logging.log.exception(traceback.format_stack()) - - # return output - - - def topNode(self, node=".*", option=""): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - memory_total_query = f'node_memory_MemTotal_bytes{{{GlobalAttrs.node_exporter_node_label}=~"{node}"}}' - memory_total = self.run_query(memory_total_query) - if not memory_total.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {memory_total_query}" - return output - if not memory_total.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {memory_total_query}" - return output - - nodes_dct = {} - - for node_ in memory_total.get('data').get('result'): - - if (option == 'cloud') or (option == 'json'): - nodes_dct[node_.get('metric').get(GlobalAttrs.node_exporter_node_label)] = { - "memory_total": int(node_.get('value')[1]), - "memory_free": -1, - "memory_used": -1, - "cpu_cores": -1, - # "cpu_used": -1, # not sure of the metrics to get the used cpu in milicores. - "cpu_used_percentage": -1, - "running_pods_num": -1, - "cluster": "?", - "node_os": "?", - "node_arch": "?", - "region": "?", - "az": "?", - "instance_type": "?", - "cluster_env": "Unknown", - "node_group_capacity_type": "?", - "node_group_name": "?", - } - else: - nodes_dct[node_.get('metric').get(GlobalAttrs.node_exporter_node_label)] = { - "memory_total": int(node_.get('value')[1]), - "memory_free": -1, - "memory_used": -1, - "cpu_cores": -1, - # "cpu_used": -1, # not sure of the metrics to get the used cpu in milicores. - "cpu_used_percentage": -1, - "running_pods_num": -1, - } - - memory_free_query = f'node_memory_MemFree_bytes{{{GlobalAttrs.node_exporter_node_label}=~"{node}"}}' - memory_free = self.run_query(memory_free_query) - if not memory_free.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {memory_free_query}" - return output - if not memory_free.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {memory_free_query}" - return output - - cpu_cores_query = f'machine_cpu_cores{{kubernetes_io_hostname=~"{node}"}}' - cpu_cores = self.run_query(cpu_cores_query) - if not cpu_cores.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {cpu_cores_query}" - return output - if not cpu_cores.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {cpu_cores_query}" - return output - - #### Fix - cpu_used_percentage_query = f'100 - (avg by ({GlobalAttrs.node_exporter_node_label}) (rate(node_cpu_seconds_total{{mode="idle", {GlobalAttrs.node_exporter_node_label}=~"{node}"}}[10m])) * 100)' - cpu_used_percentage = self.run_query(cpu_used_percentage_query) - if not cpu_used_percentage.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {cpu_used_percentage_query}" - return output - if not cpu_used_percentage.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {cpu_used_percentage_query}" - return output - - running_pods_count_query = f'kubelet_running_pods{{instance=~"{node}"}}' - running_pods_count = self.run_query(running_pods_count_query) - if not running_pods_count.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {running_pods_count_query}" - return output - if not running_pods_count.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {running_pods_count_query}" - return output - - ## - if (option == 'cloud') or (option == 'json'): - node_managed_k8s_info = self.nodeManagedK8sInfo(node=node) - if not node_managed_k8s_info.get('success'): - output['fail_reason'] = node_managed_k8s_info.get('fail_reason') - return output - - for node in memory_free.get('data').get('result'): - nodes_dct[node.get('metric').get(GlobalAttrs.node_exporter_node_label)]['memory_free'] = int(node.get('value')[1]) - nodes_dct[node.get('metric').get(GlobalAttrs.node_exporter_node_label)]['memory_used'] = nodes_dct[node.get('metric').get(GlobalAttrs.node_exporter_node_label)]['memory_total'] - int(node.get('value')[1]) - - for node in cpu_cores.get('data').get('result'): - try: - nodes_dct[node.get('metric').get('instance')]['cpu_cores'] = int(node.get('value')[1]) - except KeyError: - pass # A KeyError Exception is expected as this metric returns the value for the master nodes while other metrics dont. - - for node in cpu_used_percentage.get('data').get('result'): - nodes_dct[node.get('metric').get(GlobalAttrs.node_exporter_node_label)]['cpu_used_percentage'] = float(node.get('value')[1]) - - for node in running_pods_count.get('data').get('result'): - try: - nodes_dct[node.get('metric').get('instance')]['running_pods_num'] = int(node.get('value')[1]) - except KeyError: - pass # A KeyError Exception is expected as this metric returns the value for the master nodes while other metrics dont. - - # rich.print(node_managed_k8s_info) - if (option == 'cloud') or (option == 'json'): - for node in node_managed_k8s_info.get('result'): - # General Labels (match different cloud providers) - # rich.print(node_managed_k8s_info.get('result')) - try: - nodes_dct[node.get('metric').get('instance')]['node_arch'] = node['metric']['beta_kubernetes_io_arch'] - nodes_dct[node.get('metric').get('instance')]['node_os'] = node['metric']['beta_kubernetes_io_os'] - nodes_dct[node.get('metric').get('instance')]['region'] = node['metric']['topology_kubernetes_io_region'] - nodes_dct[node.get('metric').get('instance')]['az'] = node['metric']['topology_kubernetes_io_zone'] - nodes_dct[node.get('metric').get('instance')]['instance_type'] = node['metric']['node_kubernetes_io_instance_type'] - nodes_dct[node.get('metric').get('instance')]['node_group_name'] = node['metric']['eks_amazonaws_com_nodegroup'] - - except KeyError: - pass # If labels are not found, means that most probably this is a Local cluster - - try: - nodes_dct[node.get('metric').get('instance')]['cluster'] = node['metric']['cluster'] - except: - pass # If labels are not found, means that most probably this is a Local cluster - - # AWS Labels - try: - nodes_dct[node.get('metric').get('instance')]['node_group_capacity_type'] = node['metric']['eks_amazonaws_com_capacityType'] - nodes_dct[node.get('metric').get('instance')]['node_group_name'] = node['metric']['eks_amazonaws_com_nodegroup'] - if nodes_dct[node.get('metric').get('instance')]['node_group_name']: - nodes_dct[node.get('metric').get('instance')]['cluster_env'] = 'EKS' - except KeyError: - pass # If labels are not found, means that it's not an EKS cluster. - - output['result'] = nodes_dct - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def topNodeJson(self, node=".*", color=False): - nodes_dct = self.topNode(node=node, option='json') - if not nodes_dct.get('success'): - print(f"ERROR -- Failed to get nodes \n{nodes_dct.get('fail_reason')}") - exit(1) - - if color: - rich.print_json(data=nodes_dct.get('result')) - else: - print(json.dumps(nodes_dct.get('result'), indent=4)) - - - def topNodeTable(self, option=""): - """ - """ - nodes_json = self.topNode(option=option) - # import rich - # rich.print(nodes_json) - if not nodes_json.get('success'): - print(f"No nodes found \n{bcolors.WARNING + str(nodes_json.get('fail_reason')) + bcolors.ENDC}") - exit(1) - - - table = [['NODE', 'MEM TOTAL', 'MEM USAGE', 'MEM FREE', 'CPU CORES', 'CPU USAGE%', 'RUNNING PODS' ]] - if option == 'cloud': - table = [['NODE', 'MEM TOTAL', 'MEM USAGE', 'MEM FREE', 'CPU CORES', 'CPU USAGE%', 'RUNNING PODS', 'CLUSTER', 'INSTANCE TYPE', 'AZ', 'ENV', 'NG CAPACITY TYPE', 'NG']] - - if option == 'cloud': - for node, value in nodes_json.get('result').items(): - row = [ - node, - helper_.bytes_to_kb_mb_gb(value.get('memory_total')), - helper_.bytes_to_kb_mb_gb(value.get('memory_used')), - helper_.bytes_to_kb_mb_gb(value.get('memory_free')), - value.get('cpu_cores'), - str(round(value.get('cpu_used_percentage'))) + "%", - value.get('running_pods_num'), - value.get('cluster'), - value.get('instance_type'), - # value.get('region'), - value.get('az'), - value.get('cluster_env'), - value.get('node_group_capacity_type'), - value.get('node_group_name'), - ] - table.append(row) - else: - for node, value in nodes_json.get('result').items(): - row = [ - node, - helper_.bytes_to_kb_mb_gb(value.get('memory_total')), - helper_.bytes_to_kb_mb_gb(value.get('memory_used')), - helper_.bytes_to_kb_mb_gb(value.get('memory_free')), - value.get('cpu_cores'), - str(round(value.get('cpu_used_percentage'))) + "%", - value.get('running_pods_num'), - ] - table.append(row) - - out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=False) - print(out) - - - def nodeManagedK8sInfo(self, node): - """ - INPUT: - - K8s node name (str) - Return: - - dct of metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - query = f'kubelet_node_name{{kubernetes_io_hostname=~"{node}"}}' # 'machine_cpu_cores' also has the needed labels - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - output['result'] = result.get('data').get('result') - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - - - - diff --git a/kubePtop/node_monitor.py b/kubePtop/node_monitor.py deleted file mode 100644 index 8c76579..0000000 --- a/kubePtop/node_monitor.py +++ /dev/null @@ -1,921 +0,0 @@ -import time -from tabulate import tabulate -# import textwrap -from datetime import datetime, timezone -import threading -import rich -from rich.live import Live -from rich.table import Table -from rich.panel import Panel -from rich.progress import SpinnerColumn, Progress, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn, TimeElapsedColumn -from rich.layout import Layout -from rich.console import Console, Group -from rich.rule import Rule -from rich.console import Console -from rich.markdown import Markdown -from rich.text import Text -import traceback - - -from kubePtop.global_attrs import GlobalAttrs -from kubePtop.node_metrics import PrometheusNodeMetrics -from kubePtop.ascii_graph import AsciiGraph -from kubePtop.helper import Helper -helper_ = Helper() - -from kubePtop.logging import Logging - -class Node_Monitoring(PrometheusNodeMetrics): - def __init__(self): - super().__init__() - self.dashboards = ['default', 'pvc'] - - - - def list_dashboards(self): - # Print it with tabulate table. - print(self.dashboards) - - def display_dashboard(self, dashboard, node_name): - if dashboard not in self.dashboards: - print(f"ERROR -- Dashboard '{dashboard}' not found") - Logging.log.error(f"ERROR -- Dashboard '{dashboard}' not found") - print("Available dashboards:") - print(self.list_dashboards()) - - if dashboard == 'default': - self.node_monitor_dashboard_default(node_name) - if dashboard == 'pvc': - self.node_monitor_dashboard_pvc(node_name) - - - - def node_monitor_dashboard_default(self, node_name): - # Print loading because the layout may take few seconds to start (Probably due to slow connection) - rich.print("[blink]Loading ...", end="\r") - - def make_layout() -> Layout: - """ - The layout structure - """ - layout = Layout(name="root") - - layout.split( - Layout(name="header", size=3), - # Layout(name="header2", size=7, ratio=1), - Layout(name="main", ratio=1), - # Layout(name="footer", size=6, ratio=1) - ) - layout["main"].split_row( - # Layout(name="side",), - Layout(name="body", ratio=3, minimum_size=100,), - ) - # layout["side"].split(Layout(name="box1")) # , Layout(name="box2") - # layout["body"].split(Layout(name="head", size=5, ratio=2), Layout(name="body1")) # , Layout(name="box2") - layout["body"].split_row(Layout(name="body1", size=45), Layout(name="body2"),) # , Layout(name="box2") - layout['body1'].split_column(Layout(name="body1_a"), Layout(name="body1_b", size=11)) - layout["body2"].split(Layout(name="body2_a", ratio=1), Layout(name="body2_b", ratio=1)) # , Layout(name="box2") - layout['body2_b'].split_row(Layout(name="body2_b_a", ratio=1), Layout(name="body2_b_b", ratio=1)) - - return layout - - class Header(): - """ - Display header with clock. - """ - def __rich__(self) -> Panel: - grid = Table.grid(expand=True) - grid.add_column(justify="center", ratio=1) - grid.add_column(justify="right") - grid.add_row( - f"[b]Node: [/b] {node_name} ", - datetime.now().ctime().replace(":", "[blink]:[/]"), - ) - return Panel(grid, style="green") - - class Node_Resources_Progress(PrometheusNodeMetrics): - def __init__(self): - super().__init__() - self.progress_start() - - def progress_start(self): - # node_metrics_json = self.nodeMetrics(node=node_name) - # node_mem_metrics_json = node_metrics_json.get('memory') - # node_cpu_metrics_json = node_metrics_json.get('cpu') - # node_fs_metrics_json = node_metrics_json.get('fs') - - - self.progress_threads_status = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - self.task_thread_refresh = self.progress_threads_status.add_task(description=f"[white]Interval Refresh", status=f"unknown") - self.task_prometheus_server_connection = self.progress_threads_status.add_task(description=f"[white]Prometheus", status=f"unknown") - - self.progress_mem_total = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - # if node_mem_metrics_json.get('MemTotalBytes').get('success'): - self.task_mem_total = self.progress_mem_total.add_task(description=f"[white]Mem Total ", status="Loading") - - self.progress_mem = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - - # if (node_mem_metrics_json.get('MemTotalBytes').get('success') and node_mem_metrics_json.get('MemAvailableBytes').get('success')): - self.task_mem_used = self.progress_mem.add_task(completed=0, description=f"[white]Mem used", total=100, status="Loading") - # if node_mem_metrics_json.get('MemAvailableBytes').get('success'): - # self.task_mem_available = self.progress_mem.add_task(completed=0, description=f"[white]Mem available", total=100, status="Loading") - # if node_mem_metrics_json.get('MemFreeBytes').get('success'): - self.task_mem_free = self.progress_mem.add_task(completed=0, description=f"[white]Mem free", total=100, status="Loading") - # if node_mem_metrics_json.get('MemCachedBytes').get('success'): - self.task_mem_cached = self.progress_mem.add_task(completed=0, description=f"[white]Mem cached ", total=100, status="Loading") - # if node_mem_metrics_json.get('MemBuffersBytes').get('success'): - self.task_mem_buffer = self.progress_mem.add_task(completed=0, description=f"[white]Mem buffer ", total=100, status="Loading") - - self.progress_swap = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_mem_metrics_json.get('MemSwapTotalBytes').get('success'): - self.task_swap_total = self.progress_swap.add_task(completed=0, description=f"[white]Swap Total ", total=100, status="Loading") - # if node_mem_metrics_json.get('MemSwapTotalBytes').get('success'): - self.task_swap_free = self.progress_swap.add_task(completed=0, description=f"[white]Swap free ", total=100, status="Loading") - # if node_mem_metrics_json.get('MemSwapCachedBytes').get('success'): - self.task_swap_cached = self.progress_swap.add_task(completed=0, description=f"[white]Swap cached ", total=100, status="Loading") - - self.progress_cpu_used_avg = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_cpu_metrics_json.get('cpuUsageAVG').get('success'): - self.task_cpu_used_avg = self.progress_cpu_used_avg.add_task(description="CPU used AVG[10m]", completed=0, total=100, status="Loading") - - self.progress_cpu = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_cpu_metrics_json.get('cpuLoadAvg1m').get('success'): - self.task_cpu_load1avg = self.progress_cpu.add_task(description=f"[white]CPU load avg 1m ", status="Loading") - self.task_cpu_load5avg = self.progress_cpu.add_task(description=f"[white]CPU load avg 5m ", status="Loading") - self.task_cpu_load15avg = self.progress_cpu.add_task(description=f"[white]CPU load avg 15m ", status="Loading") - - - self.progress_fs_total = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_fs_metrics_json.get('nodeFsSize').get('success'): - self.task_fs_size_total = self.progress_fs_total.add_task(description=f"[white]FS Total ", status="Loading") - - self.progress_fs = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_fs_metrics_json.get('nodeFsUsed').get('success'): - self.task_fs_used = self.progress_fs.add_task(completed=0, description=f"[white]FS used ", total=100, status="Loading") - - # if node_fs_metrics_json.get('nodeFsAvailable').get('success'): - self.task_fs_available = self.progress_fs.add_task(completed=0, description=f"[white]FS available ", total=100, status="Loading") - - - - self.group_memory = Group ( - self.progress_mem_total, - self.progress_mem, - Rule(style='#AAAAAA'), - self.progress_swap, - ) - - self.group_cpu = Group ( - self.progress_cpu_used_avg, - self.progress_cpu - ) - - self.group_fs = Group ( - self.progress_fs_total, - self.progress_fs - ) - - def update(self): - time.sleep(3) - while True: - Logging.log.info("Getting node metrics to update the dashboard") - node_metrics_json = self.nodeMetrics(node=node_name) - Logging.log.debug("Node metrics Json:") - Logging.log.debug(node_metrics_json) - node_mem_metrics_json = node_metrics_json.get('memory') - node_cpu_metrics_json = node_metrics_json.get('cpu') - node_fs_metrics_json = node_metrics_json.get('fs') - - self.progress_mem_total.update(self.task_mem_total, description=f"[white]Mem Total ", status=f" {helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemTotalBytes').get('result'))}") - self.progress_mem.update(self.task_mem_used, completed=node_mem_metrics_json.get('MemTotalBytes').get('result') - (node_mem_metrics_json.get('MemFreeBytes').get('result') + node_mem_metrics_json.get('MemBuffersBytes').get('result') + node_mem_metrics_json.get('MemCachedBytes').get('result')), description=f"[white]Mem used", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemTotalBytes').get('result') - (node_mem_metrics_json.get('MemFreeBytes').get('result') + node_mem_metrics_json.get('MemBuffersBytes').get('result') + node_mem_metrics_json.get('MemCachedBytes').get('result')))}") - # self.progress_mem.update(self.task_mem_available, completed=node_mem_metrics_json.get('MemAvailableBytes').get('result'), description=f"[white]Mem available", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemAvailableBytes').get('result'))}") - self.progress_mem.update(self.task_mem_free, completed=node_mem_metrics_json.get('MemFreeBytes').get('result'), description=f"[white]Mem free", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemFreeBytes').get('result'))}") - self.progress_mem.update(self.task_mem_cached, completed=node_mem_metrics_json.get('MemCachedBytes').get('result'), description=f"[white]Mem cached ", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemCachedBytes').get('result'))}") - self.progress_mem.update(self.task_mem_buffer, completed=node_mem_metrics_json.get('MemBuffersBytes').get('result'), description=f"[white]Mem buffer ", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemBuffersBytes').get('result'))}") - - self.progress_swap.update(self.task_swap_total, completed=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), description=f"[white]Swap Total ", total=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemSwapTotalBytes').get('result'))}") - self.progress_swap.update(self.task_swap_free, completed=node_mem_metrics_json.get('MemSwapFreeBytes').get('result'), description=f"[white]Swap free ", total=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemSwapFreeBytes').get('result'))}") - self.progress_swap.update(self.task_swap_cached, completed=node_mem_metrics_json.get('MemSwapCachedBytes').get('result'), description=f"[white]Swap cached ", total=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemSwapCachedBytes').get('result'))}") - - self.progress_cpu_used_avg.update(self.task_cpu_used_avg, completed=(node_cpu_metrics_json.get('cpuUsageAVG').get('result') / 2), description=f"[white]CPU used AVG[10m] ", total=100, status="") - self.progress_cpu.update(self.task_cpu_load1avg, description=f"[white]CPU load avg 1m ", status=node_cpu_metrics_json.get('cpuLoadAvg1m').get('result')) - self.progress_cpu.update(self.task_cpu_load5avg, description=f"[white]CPU load avg 5m ", status=node_cpu_metrics_json.get('cpuLoadAvg5m').get('result')) - self.progress_cpu.update(self.task_cpu_load15avg, description=f"[white]CPU load avg 15m ", status=node_cpu_metrics_json.get('cpuLoadAvg15m').get('result')) - - self.progress_fs_total.update(self.task_fs_size_total, description=f"[white]FS Total ", status=helper_.bytes_to_kb_mb_gb(node_fs_metrics_json.get('nodeFsSize').get('result'))) - self.progress_fs.update(self.task_fs_used, completed=node_fs_metrics_json.get('nodeFsUsed').get('result'), description=f"[white]FS used ", total=node_fs_metrics_json.get('nodeFsSize').get('result'), status=helper_.bytes_to_kb_mb_gb(node_fs_metrics_json.get('nodeFsUsed').get('result'))) - self.progress_fs.update(self.task_fs_available, completed=node_fs_metrics_json.get('nodeFsAvailable').get('result'), description=f"[white]FS available ", total=node_fs_metrics_json.get('nodeFsSize').get('result'), status=helper_.bytes_to_kb_mb_gb(node_fs_metrics_json.get('nodeFsAvailable').get('result'))) - - if GlobalAttrs.debug: - Logging.log.debug(f"Waiting for interval '{GlobalAttrs.live_update_interval}' before the next update") - time.sleep(GlobalAttrs.live_update_interval) - - def check_thread_node_resources(self, restart=True): - while True: - def thread_status(): - status = "" - if self.thread_node_resources.is_alive(): - status = f"alive [green]✔️" - else: - status = "dead [red]❌" - if restart: - # Restart thread - self.start_threads() - return status - - self.progress_threads_status.update(task_id=self.task_thread_refresh, status=thread_status()) - time.sleep(5) - - class ValidatePrometheuesConnection(PrometheusNodeMetrics): - def __init__(self): - super().__init__() - self.result = {} - - def run(self): - while True: - time.sleep(5) - self.result = self.verify_prometheus_connection() - if GlobalAttrs.debug: - print("DEBUG -- Function: ValidatePrometheuesConnection") - Logging.log.info("Function: ValidatePrometheuesConnection") - Logging.log.info("Function: ValidatePrometheuesConnection, waiting for internal '5s' ") - - def check_thread_prometheus_server_connection(self): - while True: - - def thread_status(): - result = self.vlaidate_prometheus_server.result - # if self.thread_check_thread_prometheus_server_connection.is_alive(): - if result.get('connected') is None: - status = f"waiting [green]✔️" - elif result.get('connected'): - status = f"connected [green]✔️" - else: - status = f"{result.get('reason')} [red]❌" - - return status - - self.progress_threads_status.update(task_id=self.task_prometheus_server_connection, status=f"{thread_status()} ({self.vlaidate_prometheus_server.result.get('status_code')})") - time.sleep(5) - - def start_threads(self): - self.thread_node_resources = threading.Thread(target=self.update) - self.thread_node_resources.daemon = True - self.thread_node_resources.start() - Logging.log.debug("Started Thread: thread_node_resources") - - self.vlaidate_prometheus_server = self.ValidatePrometheuesConnection() - self.thread_prometheus_server_connection = threading.Thread(target=self.vlaidate_prometheus_server.run) - self.thread_prometheus_server_connection.daemon = True - self.thread_prometheus_server_connection.start() - Logging.log.debug("Started Thread: thread_prometheus_server_connection") - - def watch_threads(self): - self.thread_check_thread_node_resources = threading.Thread(target=self.check_thread_node_resources) - self.thread_check_thread_node_resources.daemon = True - self.thread_check_thread_node_resources.start() - - self.thread_check_thread_prometheus_server_connection = threading.Thread(target=self.check_thread_prometheus_server_connection) - self.thread_check_thread_prometheus_server_connection.daemon = True - self.thread_check_thread_prometheus_server_connection.start() - - - try: - node_metrics = PrometheusNodeMetrics() - node_resources_progress = Node_Resources_Progress() - - progress_table = Table.grid(expand=True) - progress_table.add_row( - Panel(node_resources_progress.group_cpu, title="[b]CPU", padding=(1, 2)), - ) - progress_table.add_row( - Panel(node_resources_progress.group_memory, title="[b]Memory", padding=(1, 2)), - ) - progress_table.add_row( - Panel(node_resources_progress.group_fs, title='[b]FS "/"', padding=(1, 2)), - ) - progress_table.add_row( - Panel(node_resources_progress.progress_threads_status, title="[b]Threads Status",padding=(1, 2), subtitle=""), - ) - - - layout = make_layout() - layout["header"].update(Header()) - layout["body1_a"].update(progress_table) - layout['body1_b'].update(Panel("Made with [red]❤️[/red]", title='[b]Unused Space', padding=(1, 2),)) - - - layout["body2_a"].update(Panel("Loading ...", title="[b]Top Pods in Memory Usage", padding=(1, 1))) - - node_resources_progress.start_threads() - node_resources_progress.watch_threads() - - update_disk_read_bytes_graph = True - disk_read_bytes_graph = AsciiGraph() - disk_read_bytes = self.nodeDiskReadBytes(node_name) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'disk_read_bytes' metrics; Result:\n{disk_read_bytes}") - else: - Logging.log.info("Getting Pod 'disk_read_bytes' metrics") - if disk_read_bytes.get('success'): - disk_read_bytes_graph.create_graph(disk_read_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - disk_read_bytes_graph.graph = disk_read_bytes.get('fail_reason') - update_disk_read_bytes_graph = False - - update_network_received_bytes_graph = True - network_received_bytes_graph = AsciiGraph() - network_received_bytes = self.nodeNetworkReceiveBytes(node_name) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'network_received_bytes' metrics; Result:\n{network_received_bytes}") - else: - Logging.log.info("Getting Pod 'network_received_bytes' metrics") - if network_received_bytes.get('success'): - network_received_bytes_graph.create_graph(network_received_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - network_received_bytes_graph.graph = network_received_bytes.get('fail_reason') - update_network_received_bytes_graph = False - - update_network_transmit_bytes_graph = True - network_transmit_bytes_graph = AsciiGraph() - network_transmit_bytes = self.nodeNetworkTransmitBytes(node_name) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'network_transmit_bytes' metrics; Result:\n{network_transmit_bytes}") - else: - Logging.log.info("Getting Pod 'network_transmit_bytes' metrics") - if network_transmit_bytes.get('success'): - network_transmit_bytes_graph.create_graph(network_transmit_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - network_transmit_bytes_graph.graph = network_transmit_bytes.get('fail_reason') - update_network_transmit_bytes_graph = False - - - - update_disk_written_bytes_graph = True - disk_written_bytes_graph = AsciiGraph() - disk_written_bytes = self.nodeDiskWrittenBytes(node_name) - if disk_written_bytes.get('success'): - disk_written_bytes_graph.create_graph(disk_written_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - disk_written_bytes_graph.graph = disk_written_bytes.get('fail_reason') - update_disk_written_bytes_graph = False - - layout["body2_b_b"].update(Panel(Markdown("Loading ..."), title="[b]Network IO", padding=(1, 1))) - layout["body2_b_a"].update(Panel(Markdown("Loading ..."), title="[b]Disk IO", padding=(1, 1))) - - group_network_io = Group( - Markdown("Bytes Received", justify='center'), - Text.from_ansi(network_received_bytes_graph.graph + f"\n {network_received_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Transmitted", justify='center'), - Text.from_ansi(network_transmit_bytes_graph.graph + f"\n {network_transmit_bytes_graph.colors_description_str}") - ) - - group_disk_io = Group( - Markdown("Bytes Read", justify='center'), - Text.from_ansi(disk_read_bytes_graph.graph + f"\n {disk_read_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Written", justify='center'), - Text.from_ansi(disk_written_bytes_graph.graph + f"\n {disk_written_bytes_graph.colors_description_str}") - ) - - Logging.log.info("Starting the Layout.") - with Live(layout, auto_refresh=True, screen=True, refresh_per_second=GlobalAttrs.live_update_interval): - while True: - pod_memory_usage = node_metrics.PodMemTopUsage(node=node_name) - layout["body2_a"].update(Panel(pod_memory_usage, title="[b]Top Pods in Memory Usage", padding=(1, 1))) - Logging.log.info("Updating the Layout with 'Top Pods in Memory Usage'") - Logging.log.debug(f"Result:\n{pod_memory_usage}") - - if update_network_received_bytes_graph: - network_received_bytes = self.nodeNetworkReceiveBytes(node_name) - Logging.log.info("Updating Node 'network_received_bytes' metrics") - Logging.log.debug(network_received_bytes) - for device, value in network_received_bytes.get('result').items(): - network_received_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - if update_network_transmit_bytes_graph: - Logging.log.info("Updating Node 'network_transmit_bytes' metrics") - Logging.log.debug(network_transmit_bytes) - network_transmit_bytes = self.nodeNetworkTransmitBytes(node_name) - for device, value in network_transmit_bytes.get('result').items(): - network_transmit_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - if update_disk_read_bytes_graph: - disk_read_bytes = self.nodeDiskReadBytes(node_name) - Logging.log.info("Updating Node 'disk_read_bytes' metrics") - Logging.log.debug(disk_read_bytes) - for device, value in disk_read_bytes.get('result').items(): - disk_read_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - if update_disk_written_bytes_graph: - disk_written_bytes = self.nodeDiskWrittenBytes(node_name) - Logging.log.info("Updating Node 'disk_written_bytes' metrics") - Logging.log.debug(disk_written_bytes) - for device, value in disk_written_bytes.get('result').items(): - disk_written_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - if update_network_received_bytes_graph or update_network_transmit_bytes_graph: - group_network_io = Group( - Markdown("Bytes Received", justify='center'), - Text.from_ansi(network_received_bytes_graph.graph + f"\n {network_received_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Transmitted", justify='center'), - Text.from_ansi(network_transmit_bytes_graph.graph + f"\n {network_transmit_bytes_graph.colors_description_str}") - ) - - if update_disk_read_bytes_graph or update_disk_written_bytes_graph: - group_disk_io = Group( - Markdown("Bytes Read", justify='center'), - Text.from_ansi(disk_read_bytes_graph.graph + f"\n {disk_read_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Written", justify='center'), - Text.from_ansi(disk_written_bytes_graph.graph + f"\n {disk_written_bytes_graph.colors_description_str}") - ) - - layout["body2_b_b"].update(Panel(group_network_io, title="[b]Network IO", padding=(1, 1))) - layout["body2_b_a"].update(Panel(group_disk_io, title="[b]Disk IO", padding=(1, 1))) - - Logging.log.info(f"waiting for the update interval '{GlobalAttrs.live_update_interval}' before updating the Layout ") - time.sleep(GlobalAttrs.live_update_interval) - Logging.log.info(f"Updating the layout") - - except Exception as e: - rich.print("\n[yellow]ERROR -- " + str(e)) - rich.print("\n[underline bold]Exception:") - traceback.print_exc() - exit(1) - except KeyboardInterrupt: - print(" ", end="\r") - rich.print("Ok") - exit(0) - - - def node_monitor_dashboard_pvc(self, node_name): - # Print loading because the layout may take few seconds to start (Probably due to slow connection) - rich.print("[blink]Loading ...", end="\r") - - def make_layout() -> Layout: - """ - The layout structure - """ - layout = Layout(name="root") - - layout.split( - Layout(name="header", size=3), - Layout(name="main", ratio=1), - ) - layout["main"].split_row( - Layout(name="body", ratio=3, minimum_size=100,), - ) - - layout["body"].split_column(Layout(name="body1", size=23), Layout(name="body2"),) # , Layout(name="box2") - return layout - - class Header(): - """ - Display header with clock. - """ - def __rich__(self) -> Panel: - grid = Table.grid(expand=True) - grid.add_column(justify="center", ratio=1) - grid.add_column(justify="right") - grid.add_row( - f"[b]Node: [/b] {node_name} ", - datetime.now().ctime().replace(":", "[blink]:[/]"), - ) - return Panel(grid, style="green") - - class Node_Resources_Progress(PrometheusNodeMetrics): - def __init__(self): - super().__init__() - self.progress_start() - - def progress_start(self): - # node_metrics_json = self.nodeMetrics(node=node_name) - # node_mem_metrics_json = node_metrics_json.get('memory') - # node_cpu_metrics_json = node_metrics_json.get('cpu') - # node_fs_metrics_json = node_metrics_json.get('fs') - - - self.progress_threads_status = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - self.task_thread_refresh = self.progress_threads_status.add_task(description=f"[white]Metrics Refresh", status=f"unknown") - self.task_prometheus_server_connection = self.progress_threads_status.add_task(description=f"[white]Prometheus", status=f"unknown") - - self.progress_mem_total = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - # if node_mem_metrics_json.get('MemTotalBytes').get('success'): - self.task_mem_total = self.progress_mem_total.add_task(description=f"[white]Mem Total ", status="Loading") - - self.progress_mem = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - - # if (node_mem_metrics_json.get('MemTotalBytes').get('success') and node_mem_metrics_json.get('MemAvailableBytes').get('success')): - self.task_mem_used = self.progress_mem.add_task(completed=0, description=f"[white]Mem used", total=100, status="Loading") - # if node_mem_metrics_json.get('MemAvailableBytes').get('success'): - # self.task_mem_available = self.progress_mem.add_task(completed=0, description=f"[white]Mem available", total=100, status="Loading") - # if node_mem_metrics_json.get('MemFreeBytes').get('success'): - self.task_mem_free = self.progress_mem.add_task(completed=0, description=f"[white]Mem free", total=100, status="Loading") - # if node_mem_metrics_json.get('MemCachedBytes').get('success'): - self.task_mem_cached = self.progress_mem.add_task(completed=0, description=f"[white]Mem cached ", total=100, status="Loading") - # if node_mem_metrics_json.get('MemBuffersBytes').get('success'): - self.task_mem_buffer = self.progress_mem.add_task(completed=0, description=f"[white]Mem buffer ", total=100, status="Loading") - - self.progress_swap = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_mem_metrics_json.get('MemSwapTotalBytes').get('success'): - self.task_swap_total = self.progress_swap.add_task(completed=0, description=f"[white]Swap Total ", total=100, status="Loading") - # if node_mem_metrics_json.get('MemSwapTotalBytes').get('success'): - self.task_swap_free = self.progress_swap.add_task(completed=0, description=f"[white]Swap free ", total=100, status="Loading") - # if node_mem_metrics_json.get('MemSwapCachedBytes').get('success'): - self.task_swap_cached = self.progress_swap.add_task(completed=0, description=f"[white]Swap cached ", total=100, status="Loading") - - self.progress_cpu_used_avg = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_cpu_metrics_json.get('cpuUsageAVG').get('success'): - self.task_cpu_used_avg = self.progress_cpu_used_avg.add_task(description="CPU used AVG[10m]", completed=0, total=100, status="Loading") - - self.progress_cpu = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_cpu_metrics_json.get('cpuLoadAvg1m').get('success'): - self.task_cpu_load1avg = self.progress_cpu.add_task(description=f"[white]CPU load avg 1m ", status="Loading") - self.task_cpu_load5avg = self.progress_cpu.add_task(description=f"[white]CPU load avg 5m ", status="Loading") - self.task_cpu_load15avg = self.progress_cpu.add_task(description=f"[white]CPU load avg 15m ", status="Loading") - - - self.progress_fs_total = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_fs_metrics_json.get('nodeFsSize').get('success'): - self.task_fs_size_total = self.progress_fs_total.add_task(description=f"[white]FS Total ", status="Loading") - - self.progress_fs = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - # if node_fs_metrics_json.get('nodeFsUsed').get('success'): - self.task_fs_used = self.progress_fs.add_task(completed=0, description=f"[white]FS used ", total=100, status="Loading") - - # if node_fs_metrics_json.get('nodeFsAvailable').get('success'): - self.task_fs_available = self.progress_fs.add_task(completed=0, description=f"[white]FS available ", total=100, status="Loading") - - - - self.group_memory = Group ( - self.progress_mem_total, - self.progress_mem, - Rule(style='#AAAAAA'), - self.progress_swap, - ) - - self.group_cpu = Group ( - self.progress_cpu_used_avg, - self.progress_cpu - ) - - self.group_fs = Group ( - self.progress_fs_total, - self.progress_fs - ) - - def update(self): - time.sleep(3) - while True: - Logging.log.info("Getting node metrics to update the dashboard") - node_metrics_json = self.nodeMetrics(node=node_name) - if GlobalAttrs.debug: - Logging.log.info("Node metrics Json:") - Logging.log.debug(node_metrics_json) - node_mem_metrics_json = node_metrics_json.get('memory') - node_cpu_metrics_json = node_metrics_json.get('cpu') - node_fs_metrics_json = node_metrics_json.get('fs') - - self.progress_mem_total.update(self.task_mem_total, description=f"[white]Mem Total ", status=f" {helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemTotalBytes').get('result'))}") - self.progress_mem.update(self.task_mem_used, completed=node_mem_metrics_json.get('MemTotalBytes').get('result') - (node_mem_metrics_json.get('MemFreeBytes').get('result')), description=f"[white]Mem used", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemTotalBytes').get('result') - (node_mem_metrics_json.get('MemFreeBytes').get('result') + node_mem_metrics_json.get('MemBuffersBytes').get('result') + node_mem_metrics_json.get('MemCachedBytes').get('result')))}") - # self.progress_mem.update(self.task_mem_available, completed=node_mem_metrics_json.get('MemAvailableBytes').get('result'), description=f"[white]Mem available", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemAvailableBytes').get('result'))}") - self.progress_mem.update(self.task_mem_free, completed=node_mem_metrics_json.get('MemFreeBytes').get('result'), description=f"[white]Mem free", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemFreeBytes').get('result'))}") - self.progress_mem.update(self.task_mem_cached, completed=node_mem_metrics_json.get('MemCachedBytes').get('result'), description=f"[white]Mem cached ", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemCachedBytes').get('result'))}") - self.progress_mem.update(self.task_mem_buffer, completed=node_mem_metrics_json.get('MemBuffersBytes').get('result'), description=f"[white]Mem buffer ", total=node_mem_metrics_json.get('MemTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemBuffersBytes').get('result'))}") - - self.progress_swap.update(self.task_swap_total, completed=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), description=f"[white]Swap Total ", total=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemSwapTotalBytes').get('result'))}") - self.progress_swap.update(self.task_swap_free, completed=node_mem_metrics_json.get('MemSwapFreeBytes').get('result'), description=f"[white]Swap free ", total=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemSwapFreeBytes').get('result'))}") - self.progress_swap.update(self.task_swap_cached, completed=node_mem_metrics_json.get('MemSwapCachedBytes').get('result'), description=f"[white]Swap cached ", total=node_mem_metrics_json.get('MemSwapTotalBytes').get('result'), status=f"{helper_.bytes_to_kb_mb_gb(node_mem_metrics_json.get('MemSwapCachedBytes').get('result'))}") - - self.progress_cpu_used_avg.update(self.task_cpu_used_avg, completed=(node_cpu_metrics_json.get('cpuUsageAVG').get('result') / 2), description=f"[white]CPU used AVG[10m] ", total=100, status="") - self.progress_cpu.update(self.task_cpu_load1avg, description=f"[white]CPU load avg 1m ", status=node_cpu_metrics_json.get('cpuLoadAvg1m').get('result')) - self.progress_cpu.update(self.task_cpu_load5avg, description=f"[white]CPU load avg 5m ", status=node_cpu_metrics_json.get('cpuLoadAvg5m').get('result')) - self.progress_cpu.update(self.task_cpu_load15avg, description=f"[white]CPU load avg 15m ", status=node_cpu_metrics_json.get('cpuLoadAvg15m').get('result')) - - self.progress_fs_total.update(self.task_fs_size_total, description=f"[white]FS Total ", status=helper_.bytes_to_kb_mb_gb(node_fs_metrics_json.get('nodeFsSize').get('result'))) - self.progress_fs.update(self.task_fs_used, completed=node_fs_metrics_json.get('nodeFsUsed').get('result'), description=f"[white]FS used ", total=node_fs_metrics_json.get('nodeFsSize').get('result'), status=helper_.bytes_to_kb_mb_gb(node_fs_metrics_json.get('nodeFsUsed').get('result'))) - self.progress_fs.update(self.task_fs_available, completed=node_fs_metrics_json.get('nodeFsAvailable').get('result'), description=f"[white]FS available ", total=node_fs_metrics_json.get('nodeFsSize').get('result'), status=helper_.bytes_to_kb_mb_gb(node_fs_metrics_json.get('nodeFsAvailable').get('result'))) - - Logging.log.debug(f"Waiting for interval '{GlobalAttrs.live_update_interval}' before the next update") - time.sleep(GlobalAttrs.live_update_interval) - - def check_thread_node_resources(self, restart=True): - while True: - def thread_status(): - status = "" - if self.thread_node_resources.is_alive(): - status = f"alive [green]✔️" - else: - status = "dead [red]❌" - if restart: - # Restart thread - self.start_threads() - return status - - self.progress_threads_status.update(task_id=self.task_thread_refresh, status=thread_status()) - time.sleep(5) - - class ValidatePrometheuesConnection(PrometheusNodeMetrics): - def __init__(self): - super().__init__() - self.result = {} - - def run(self): - while True: - time.sleep(5) - self.result = self.verify_prometheus_connection() - if GlobalAttrs.debug: - print("DEBUG -- Function: ValidatePrometheuesConnection") - Logging.log.info("Function: ValidatePrometheuesConnection") - Logging.log.info("Function: ValidatePrometheuesConnection, waiting for internal '5s' ") - - def check_thread_prometheus_server_connection(self): - while True: - - def thread_status(): - result = self.vlaidate_prometheus_server.result - # if self.thread_check_thread_prometheus_server_connection.is_alive(): - if result.get('connected') is None: - status = f"waiting [green]✔️" - elif result.get('connected'): - status = f"connected [green]✔️" - else: - status = f"{result.get('reason')} [red]❌" - - return status - - self.progress_threads_status.update(task_id=self.task_prometheus_server_connection, status=f"{thread_status()} ({self.vlaidate_prometheus_server.result.get('status_code')})") - time.sleep(5) - - def start_threads(self): - self.thread_node_resources = threading.Thread(target=self.update) - self.thread_node_resources.daemon = True - self.thread_node_resources.start() - Logging.log.debug("Started Thread: thread_node_resources") - - self.vlaidate_prometheus_server = self.ValidatePrometheuesConnection() - self.thread_prometheus_server_connection = threading.Thread(target=self.vlaidate_prometheus_server.run) - self.thread_prometheus_server_connection.daemon = True - self.thread_prometheus_server_connection.start() - Logging.log.debug("Started Thread: thread_prometheus_server_connection") - - def watch_threads(self): - self.thread_check_thread_node_resources = threading.Thread(target=self.check_thread_node_resources) - self.thread_check_thread_node_resources.daemon = True - self.thread_check_thread_node_resources.start() - - self.thread_check_thread_prometheus_server_connection = threading.Thread(target=self.check_thread_prometheus_server_connection) - self.thread_check_thread_prometheus_server_connection.daemon = True - self.thread_check_thread_prometheus_server_connection.start() - - - try: - # node_metrics = PrometheusNodeMetrics() - node_resources_progress = Node_Resources_Progress() - - progress_table = Table.grid(expand=True) - progress_table.add_row( - Panel(node_resources_progress.group_cpu, title="[b]CPU", padding=(1, 2)), - ) - progress_table.add_row( - Panel(node_resources_progress.group_memory, title="[b]Memory", padding=(1, 2)), - ) - progress_table.add_row( - Panel(node_resources_progress.group_fs, title='[b]FS "/"', padding=(1, 2)), - ) - progress_table.add_row( - Panel(node_resources_progress.progress_threads_status, title="[b]Threads Status",padding=(1, 2), subtitle=""), - ) - - - - layout = make_layout() - layout["header"].update(Header()) - # layout["body1_a"].update(progress_table) - - - # layout["body2_a"].update(Panel("Loading ...", title="[b]Top Pods in Memory Usage", padding=(1, 1))) - - node_resources_progress.start_threads() - node_resources_progress.watch_threads() - - update_disk_read_bytes_graph = True - disk_read_bytes_graph = AsciiGraph() - disk_read_bytes = self.nodeDiskReadBytes(node_name) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'disk_read_bytes' metrics; Result:\n{disk_read_bytes}") - else: - Logging.log.info("Getting Pod 'disk_read_bytes' metrics") - if disk_read_bytes.get('success'): - disk_read_bytes_graph.create_graph(disk_read_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - disk_read_bytes_graph.graph = disk_read_bytes.get('fail_reason') - update_disk_read_bytes_graph = False - - update_network_received_bytes_graph = True - network_received_bytes_graph = AsciiGraph() - network_received_bytes = self.nodeNetworkReceiveBytes(node_name) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'network_received_bytes' metrics; Result:\n{network_received_bytes}") - else: - Logging.log.info("Getting Pod 'network_received_bytes' metrics") - if network_received_bytes.get('success'): - network_received_bytes_graph.create_graph(network_received_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - network_received_bytes_graph.graph = network_received_bytes.get('fail_reason') - update_network_received_bytes_graph = False - - update_network_transmit_bytes_graph = True - network_transmit_bytes_graph = AsciiGraph() - network_transmit_bytes = self.nodeNetworkTransmitBytes(node_name) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'network_transmit_bytes' metrics; Result:\n{network_transmit_bytes}") - else: - Logging.log.info("Getting Pod 'network_transmit_bytes' metrics") - if network_transmit_bytes.get('success'): - network_transmit_bytes_graph.create_graph(network_transmit_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - network_transmit_bytes_graph.graph = network_transmit_bytes.get('fail_reason') - update_network_transmit_bytes_graph = False - - - - update_disk_written_bytes_graph = True - disk_written_bytes_graph = AsciiGraph() - disk_written_bytes = self.nodeDiskWrittenBytes(node_name) - if disk_written_bytes.get('success'): - disk_written_bytes_graph.create_graph(disk_written_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width, format='{:8.0f} kb/s') - else: - disk_written_bytes_graph.graph = disk_written_bytes.get('fail_reason') - update_disk_written_bytes_graph = False - - # layout["body2_b_b"].update(Panel(Markdown("Loading ..."), title="[b]Network IO", padding=(1, 1))) - # layout["body2_b_a"].update(Panel(Markdown("Loading ..."), title="[b]Disk IO", padding=(1, 1))) - - group_network_io = Group( - Markdown("Bytes Received", justify='center'), - Text.from_ansi(network_received_bytes_graph.graph + f"\n {network_received_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Transmitted", justify='center'), - Text.from_ansi(network_transmit_bytes_graph.graph + f"\n {network_transmit_bytes_graph.colors_description_str}") - ) - - group_disk_io = Group( - Markdown("Bytes Read", justify='center'), - Text.from_ansi(disk_read_bytes_graph.graph + f"\n {disk_read_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Written", justify='center'), - Text.from_ansi(disk_written_bytes_graph.graph + f"\n {disk_written_bytes_graph.colors_description_str}") - ) - - Logging.log.info("Starting the Layout.") - with Live(layout, auto_refresh=True, screen=True, refresh_per_second=GlobalAttrs.live_update_interval): - while True: - # pod_memory_usage = node_metrics.PodMemTopUsage(node=node_name) - # layout["body2_a"].update(Panel(pod_memory_usage, title="[b]Top Pods in Memory Usage", padding=(1, 1))) - # Logging.log.info("Updating the Layout with 'Top Pods in Memory Usage'") - # Logging.log.info(f"Result:\n{pod_memory_usage}") - - # if update_network_received_bytes_graph: - # network_received_bytes = self.nodeNetworkReceiveBytes(node_name) - # Logging.log.info("Updating Node 'network_received_bytes' metrics") - # Logging.log.info(network_received_bytes) - # for device, value in network_received_bytes.get('result').items(): - # network_received_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - # if update_network_transmit_bytes_graph: - # Logging.log.info("Updating Node 'network_transmit_bytes' metrics") - # Logging.log.info(network_transmit_bytes) - # network_transmit_bytes = self.nodeNetworkTransmitBytes(node_name) - # for device, value in network_transmit_bytes.get('result').items(): - # network_transmit_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - # if update_disk_read_bytes_graph: - # disk_read_bytes = self.nodeDiskReadBytes(node_name) - # Logging.log.info("Updating Node 'disk_read_bytes' metrics") - # Logging.log.info(disk_read_bytes) - # for device, value in disk_read_bytes.get('result').items(): - # disk_read_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - # if update_disk_written_bytes_graph: - # disk_written_bytes = self.nodeDiskWrittenBytes(node_name) - # Logging.log.info("Updating Node 'disk_written_bytes' metrics") - # Logging.log.info(disk_written_bytes) - # for device, value in disk_written_bytes.get('result').items(): - # disk_written_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - # if update_network_received_bytes_graph or update_network_transmit_bytes_graph: - # group_network_io = Group( - # Markdown("Bytes Received", justify='center'), - # Text.from_ansi(network_received_bytes_graph.graph + f"\n {network_received_bytes_graph.colors_description_str}"), - # Rule(style='#AAAAAA'), - # Markdown("Bytes Transmitted", justify='center'), - # Text.from_ansi(network_transmit_bytes_graph.graph + f"\n {network_transmit_bytes_graph.colors_description_str}") - # ) - - # if update_disk_read_bytes_graph or update_disk_written_bytes_graph: - # group_disk_io = Group( - # Markdown("Bytes Read", justify='center'), - # Text.from_ansi(disk_read_bytes_graph.graph + f"\n {disk_read_bytes_graph.colors_description_str}"), - # Rule(style='#AAAAAA'), - # Markdown("Bytes Written", justify='center'), - # Text.from_ansi(disk_written_bytes_graph.graph + f"\n {disk_written_bytes_graph.colors_description_str}") - # ) - - # layout["body2_b_b"].update(Panel(group_network_io, title="[b]Network IO", padding=(1, 1))) - # layout["body2_b_a"].update(Panel(group_disk_io, title="[b]Disk IO", padding=(1, 1))) - - Logging.log.info(f"waiting for the update interval '{GlobalAttrs.live_update_interval}' before updating the Layout ") - time.sleep(GlobalAttrs.live_update_interval) - Logging.log.info(f"Updating the layout") - - except Exception as e: - rich.print("\n[yellow]ERROR -- " + str(e)) - rich.print("\n[underline bold]Exception:") - traceback.print_exc() - exit(1) - except KeyboardInterrupt: - print(" ", end="\r") - rich.print("Ok") - exit(0) - - - def node_monitor_dashboard_memory(self, node_name): - print("not implemented yet.") - exit(0) - - - - diff --git a/kubePtop/pod_metrics.py b/kubePtop/pod_metrics.py deleted file mode 100644 index 47566fb..0000000 --- a/kubePtop/pod_metrics.py +++ /dev/null @@ -1,1214 +0,0 @@ -from kubePtop.session import PrometheusAPI -from kubePtop.global_attrs import GlobalAttrs -from kubePtop.logging import Logging -from kubePtop.helper import Helper -from kubePtop.colors import Bcolors -bcolors = Bcolors() -helper_ = Helper() -from tabulate import tabulate -import textwrap -# import rich -import math -import traceback - -class PrometheusPodsMetrics(PrometheusAPI): - def __init__(self): - super().__init__() - - - def podExists(self, pod, namespace="default"): - """ - Check if the pod exists - Returns (Bolean) True if yes, False if no - """ - output = { - "success": False, - "fail_reason": "", - "result": False - } - try: - query = f'sum(container_last_seen{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = True - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - - def podMetrics(self, pod, node=".*", container=".*", namespace="default"): - output = {} - - output['cpu'] = { - 'cpuLoadAvg10s': self.podCpuLoadAvg_10s(pod, node, container, namespace), - 'cpuUsageAVG10mMilicores': self.podCpuUsageAvg_10m(pod, node, container, namespace), - 'cpuUsageSystemAVG10mMilicores': self.podCpuUsageSystemAvg_10m(pod, node, container, namespace), - 'cpuUsageUserAVG10mMilicores': self.podCpuUsageUserAvg_10m(pod, node, container, namespace), - 'cpuQuotaMilicores': self.podCpuLimit(pod, node, container, namespace), - } - - output['memory'] = { - 'MemLimitBytes': self.podMemLimit(pod, node, container, namespace), # total, - 'MemCachedBytes': self.podMemCache(pod, node, container, namespace), - 'MemUsageBytes': self.podMemUsage(pod, node, container, namespace), - 'MemUsageMaxBytes': self.podMemUsageMax(pod, node, container, namespace), - } - - return output - - def podMemUsage(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_memory_working_set_bytes{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - # Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - # Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podMemUsagePerContainers(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes (per container) - Sample Return: - {'success': True, 'fail_reason': '', 'result': {'cp-kafka-broker': 18870292480.0, 'prometheus-jmx-exporter': 212209664.0}} - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - query = f'sum(container_memory_working_set_bytes{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace, container)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - for container in result.get('data').get('result'): - output['result'][container.get('metric').get('container')] = float(container.get('value')[1]) - - output['success'] = True - - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podMemUsagePerContainers_range(self, pod=".*", node=".*", container=".*", namespace="default", range_="3h"): - """ - Return Pod memory usage in bytes (per container) - Sample Return: - -> Returns the data in Megabytes - """ - output = { - "success": False, - "fail_reason": "", - "result": [] - } - try: - query = f'container_memory_working_set_bytes{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}[{range_}]' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - # for container in result.get('data').get('result'): - # output['result'][container.get('metric').get('container')] = container.get('value')[1] - - # output['result'] = result.get('data').get('result')[0].get('values') - timestamp_value = result.get('data').get('result')[0].get('values') - for i in timestamp_value: - output['result'].append(round(helper_.bytes_to_mb(float(i[1])))) - - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podMemUsageMax(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_memory_max_usage_bytes{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podMemLimit(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_spec_memory_limit_bytes{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podMemCache(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_memory_cache{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podSwapLimit(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_spec_memory_swap_limit_bytes{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podCpuLoadAvg_10s(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return Pod memory usage in bytes - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_cpu_load_average_10s{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = int(result.get('data').get('result')[0].get('value')[1]) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podCpuUsageAvg_10m(self, pod=".*", node=".*", container=".*", namespace="default", avg="10m"): - """ - Return number of CPU seconds used per pods. - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(container_cpu_usage_seconds_total{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}[{avg}])) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = math.ceil(float(result.get('data').get('result')[0].get('value')[1])) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podCpuUsageSystemAvg_10m(self, pod=".*", node=".*", container=".*", namespace="default", avg="10m"): - """ - Return number of CPU seconds used per pods. - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(container_cpu_system_seconds_total{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}[{avg}])) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = math.ceil(float(result.get('data').get('result')[0].get('value')[1])) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podCpuUsageUserAvg_10m(self, pod=".*", node=".*", container=".*", namespace="default", avg="10m"): - """ - Return number of CPU seconds used per pods. - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(container_cpu_user_seconds_total{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}[{avg}])) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - output['result'] = math.ceil(float(result.get('data').get('result')[0].get('value')[1])) - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podCpuLimit(self, pod=".*", node=".*", container=".*", namespace="default"): - """ - Return number of CPU seconds used per pods. - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(container_spec_cpu_quota{{image!="", container!="", container!="POD", namespace=~"{namespace}", pod=~"{pod}", container=~"{container}", {GlobalAttrs.kubernetes_exporter_node_label}=~"{node}"}}) by (pod, instance, namespace)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - result = int(result.get('data').get('result')[0].get('value')[1]) - if result > 0: - result = result // 10 - result = result // 10 - output['result'] = result - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podPVC(self, pod=".*", namespace="default"): - """ - Return number of CPU seconds used per pods. - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - # Get PVCs Names used by the Pod. - pvcs_names_query = f'sum(kube_pod_spec_volumes_persistentvolumeclaims_info{{namespace=~"{namespace}", pod=~"{pod}", container=~".*"}}) by (namespace, persistentvolumeclaim, volume, pod)' - pvc_names_result = self.run_query(pvcs_names_query) - if not pvc_names_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_names_query}" - return output - if not pvc_names_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_names_query}" - return output - - pvcs_dct = {} - for pvc in pvc_names_result.get('data').get('result'): - pvcs_dct[pvc.get('metric').get('persistentvolumeclaim')] = { - "namespace": pvc.get('metric').get('namespace'), - "pod": pvc.get('metric').get('pod'), - "volume": pvc.get('metric').get('volume'), - "capacity": -1, - "used": -1, - "available": -1, - } - - for pvc in pvcs_dct.keys(): - - # Get PVCs capacity - pvcs_capacity_query = f'sum(kubelet_volume_stats_capacity_bytes{{persistentvolumeclaim=~"{pvc}"}}) by (persistentvolumeclaim, namespace)' - pvcs_names_result = self.run_query(pvcs_capacity_query) - if not pvcs_names_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_capacity_query}" - return output - if not pvcs_names_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_capacity_query}" - # return output - if pvcs_names_result.get('data').get('result'): - pvcs_dct[pvc]['capacity'] = int(pvcs_names_result.get('data').get('result')[0].get('value')[1]) - - # Get PVCs used - pvcs_used_query = f'sum(kubelet_volume_stats_used_bytes{{persistentvolumeclaim=~"{pvc}"}}) by (persistentvolumeclaim, namespace)' - pvcs_used_result = self.run_query(pvcs_used_query) - if not pvcs_used_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_used_query}" - return output - if not pvcs_used_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_used_query}" - # return output - if pvcs_used_result.get('data').get('result'): - pvcs_dct[pvc]['used'] = int(pvcs_used_result.get('data').get('result')[0].get('value')[1]) - - # Get PVCs used - pvcs_available_query = f'sum(kubelet_volume_stats_available_bytes{{persistentvolumeclaim=~"{pvc}"}}) by (persistentvolumeclaim, namespace)' - pvcs_available_result = self.run_query(pvcs_available_query) - if not pvcs_available_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_available_query}" - return output - if not pvcs_available_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_available_query}" - # return output - if pvcs_available_result.get('data').get('result'): - pvcs_dct[pvc]['available'] = int(pvcs_available_result.get('data').get('result')[0].get('value')[1]) - - output['result'] = pvcs_dct - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podNetworkReceiveBytes(self, pod=".*", namespace="default"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - query = f'sum(irate(container_network_receive_bytes_total{{container!="", namespace=~"{namespace}", pod=~"{pod}"}}[10m])) by (pod, instance, namespace, interface)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = interfaces - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podNetworkTransmitBytes(self, pod=".*", namespace="default"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - query = f'sum(irate(container_network_transmit_bytes_total{{container!="", namespace=~"{namespace}", pod=~"{pod}"}}[10m])) by (pod, instance, namespace, interface)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = interfaces - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podPVC_table(self, pod, namespace="default"): - """ - """ - pod_pvcs_dct = self.podPVC(pod, namespace) - if not pod_pvcs_dct.get('success'): - return " " # pod_pvcs_dct.get('fail_reason') - - if len(pod_pvcs_dct.get('result')) < 1: - return "No PVCs used by the pod" - - table = [['PVC', 'CAPACITY', 'USED', 'AVAILABLE']] - for pvc, value in pod_pvcs_dct.get('result').items(): - pvc_name = "\n".join(textwrap.wrap(pvc, width=23, replace_whitespace=False)) - - if value.get('capacity') != -1: - capacity = helper_.bytes_to_kb_mb_gb(value.get('capacity')) - else: - capacity = "?" - - if value.get('used') != -1: - used = helper_.bytes_to_kb_mb_gb(value.get('used')) - else: - used = "?" - - if value.get('available') != -1: - available = helper_.bytes_to_kb_mb_gb(value.get('available')) - else: - available = "?" - - row = [pvc_name, capacity, used, available] - table.append(row) - - out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=False) - return out - - def podUpTime(self, pod=".*", namespace="default", container=".*"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": 0 - } - try: - query = f'sum(time() - container_start_time_seconds{{pod="{pod}", container=~"{container}", namespace=~"{namespace}", container!="POD", image!=""}}) by (pod, instance, namespace, container)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = float(result.get('data').get('result')[0].get('value')[1]) - - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podFileDescriptors(self, pod=".*", namespace="default", container=".*"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": 0 - } - try: - query = f'sum(container_file_descriptors{{pod="{pod}", container=~"{container}", namespace=~"{namespace}", container!="POD", image!=""}}) by (pod, instance, namespace, container)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = float(result.get('data').get('result')[0].get('value')[1]) - - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podThreads(self, pod=".*", namespace="default", container=".*"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": 0 - } - try: - query = f'sum(container_threads{{pod="{pod}", container=~"{container}", namespace=~"{namespace}", container!="POD", image!=""}}) by (pod, instance, namespace, container)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = float(result.get('data').get('result')[0].get('value')[1]) - - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - def podProcesses(self, pod=".*", namespace="default", container=".*"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": 0 - } - try: - query = f'sum(container_processes{{pod="{pod}", container=~"{container}", namespace=~"{namespace}", container!="POD", image!=""}}) by (pod, instance, namespace, container)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = float(result.get('data').get('result')[0].get('value')[1]) - - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podStartTime(self, pod=".*", namespace="default", container=".*"): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": 0 - } - try: - query = f'sum(container_start_time_seconds{{pod="{pod}", container!="POD", image!="", namespace=~"{namespace}", container=~"{container}"}}) by (pod, namespace, device, container)' - result = self.run_query(query) - - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {query}" - Logging.log.error(f"could not get metric's value: {query}") - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {query}" - Logging.log.error(f"Query did not return any data: {query}") - return output - - interfaces = {} - for interface in result.get('data').get('result'): - interfaces[interface.get('metric').get('interface')] = float(interface.get('value')[1]) - - output['result'] = float(result.get('data').get('result')[0].get('value')[1]) - - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def podDiskReadBytes(self, pod, container=".*", namespace="default"): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(container_fs_reads_bytes_total{{pod="{pod}", namespace=~"{namespace}", container=~"{container}"}}[10m])) by (pod, namespace, device)' - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - devices = {} - for device in result.get('data').get('result'): - devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - output['result'] = devices - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def podDiskWriteBytes(self, pod, container=".*", namespace="default"): - """ - INPUT: - - K8s node name (str) - Return: - - metric (dct) - """ - output = { - "success": False, - "fail_reason": "", - "result": "" - } - try: - query = f'sum(irate(container_fs_writes_bytes_total{{pod="{pod}", namespace=~"{namespace}", container=~"{container}"}}[10m])) by (pod, namespace, device)' - result = self.run_query(query) - if not result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: \n{query}" - return output - - if not result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{query}" - return output - - devices = {} - for device in result.get('data').get('result'): - devices[device.get('metric').get('device')] = float(device.get('value')[1]) - - output['result'] = devices - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - def topPod(self, namespace="default", sort_by_mem_usage=False): - """ - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - - try: - memory_limit_query = f'sum(container_spec_memory_limit_bytes{{namespace=~"{namespace}", image!="", container!="", container!="POD"}}) by (pod, instance, namespace)' - - memory_usage_query = f'sum(container_memory_working_set_bytes{{namespace=~"{namespace}", image!="", container!="", container!="POD"}}) by (pod, instance, namespace)' - if sort_by_mem_usage: - memory_usage_query = f'sort_desc(sum(container_memory_working_set_bytes{{namespace=~"{namespace}", image!="", container!="", container!="POD"}}) by (pod, instance, namespace))' - - memory_usage_max_query = f'sum(container_memory_max_usage_bytes{{namespace=~"{namespace}", image!="", container!="", container!="POD"}}) by (pod, instance, namespace)' - cpu_limit_query = f'sum(container_spec_cpu_quota{{namespace=~"{namespace}", image!="", container!="", container!="POD"}}) by (pod, instance, namespace)' - cpu_usage_query = f'sum(irate(container_cpu_usage_seconds_total{{namespace=~"{namespace}", image!="", container!="", container!="POD"}}[10m])) by (pod, instance, namespace)' - - memory_limit = self.run_query(memory_limit_query) - if not memory_limit.get('status') == 'success': - output['fail_reason'] = f"could not get metric value: \n{memory_limit_query}" - return output - - # if not memory_limit.get('data').get('result'): - # output['fail_reason'] = f"Query did not return any data: \n{memory_limit_query}" - # return output - - memory_usage = self.run_query(memory_usage_query) - if not memory_usage.get('status') == 'success': - output['fail_reason'] = f"could not get metric value: \n{memory_usage_query}" - return output - - if not memory_usage.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{memory_usage_query}" - return output - - memory_usage_max = self.run_query(memory_usage_max_query) - if not memory_usage_max.get('status') == 'success': - output['fail_reason'] = f"could not get metric value: \n{memory_usage_max_query}" - return output - - if not memory_usage_max.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{memory_usage_max_query}" - return output - - cpu_limit = self.run_query(cpu_limit_query) - if not cpu_limit.get('status') == 'success': - output['fail_reason'] = f"could not get metric value: \n{cpu_limit_query}" - return output - - # if not cpu_limit.get('data').get('result'): - # output['fail_reason'] = f"Query did not return any data: \n{cpu_limit_query}" - # return output - - cpu_usage = self.run_query(cpu_usage_query) - if not cpu_usage.get('status') == 'success': - output['fail_reason'] = f"could not get metric value: \n{cpu_usage_query}" - return output - - if not cpu_usage.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: \n{cpu_usage_query}" - return output - - dct = {} - if (len(memory_usage.get('data').get('result')) > 0) and (len(memory_limit.get('data').get('result')) > 0) and (len(memory_usage.get('data').get('result')) > 0): - for pod_mem_usage in memory_usage.get('data').get('result'): - dct[pod_mem_usage.get('metric').get('pod')] = { - "namespace": pod_mem_usage.get('metric').get('namespace'), - "instance": pod_mem_usage.get('metric').get('instance'), - "memory_usage": int(pod_mem_usage.get('value')[1]), - "memory_usage_max": 0, - "memory_limit": 0, - "cpu_limit": 0 - } - try: - for pod_mem_limit in memory_limit.get('data').get('result'): - dct[pod_mem_limit.get('metric').get('pod')]["memory_limit"] = int(pod_mem_limit.get('value')[1]) - for pod_mem_usage_max in memory_usage_max.get('data').get('result'): - dct[pod_mem_usage_max.get('metric').get('pod')]["memory_usage_max"] = int(pod_mem_usage_max.get('value')[1]) - for pod_cpu_limit in cpu_limit.get('data').get('result'): - dct[pod_cpu_limit.get('metric').get('pod')]["cpu_limit"] = int(pod_cpu_limit.get('value')[1][:-2]) - for pod_cpu_usage in cpu_usage.get('data').get('result'): - dct[pod_cpu_usage.get('metric').get('pod')]["cpu_usage"] = float('%.2f' % float(pod_cpu_usage.get('value')[1])) - except Exception as e: - print(f"ERROR -- got an error while listing pods\n{e}") - traceback.print_exc() - - - output['result'] = dct - output['success'] = True - - except(KeyError, AttributeError) as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - return output - - - def topPodTable(self, namespace="default", sort_by_mem_usage=False): - """ - """ - pods_json = self.topPod(namespace=namespace, sort_by_mem_usage=sort_by_mem_usage) - if not pods_json.get('success'): - print(f"No pods found in the '{namespace}' namespace \n{bcolors.WARNING + pods_json.get('fail_reason') + bcolors.ENDC}") - exit(1) - # import rich - # rich.print_json(data=pods_json) - - table = [['NAMESPACE', 'POD', 'MEM LIMIT', 'MEM USAGE', 'MEM USAGE%', 'MEM USAGE MAX', 'MEM FREE', 'CPU LIMIT', 'CPU USAGE' ]] - for pod, value in pods_json.get('result').items(): - - # pvc_name = "\n".join(textwrap.wrap(pvc, width=23, replace_whitespace=False)) - - if int(value.get('memory_limit')) == 0: - memory_limit = "---" #"NO_LIMIT" - memory_free = "---" - memory_usage_percentage = "---" - else: - memory_limit = helper_.bytes_to_kb_mb_gb(value.get('memory_limit')) - - if value.get('memory_limit') - value.get('memory_usage') > 0: - memory_free = helper_.bytes_to_kb_mb_gb(value.get('memory_limit') - value.get('memory_usage')) - else: - memory_free = f"-{helper_.bytes_to_kb_mb_gb((value.get('memory_limit') - value.get('memory_usage')) * -1)}" - memory_usage_percentage = helper_.percentage(value.get('memory_usage'), value.get('memory_limit')) - - if int(value.get('cpu_limit')) == 0: - cpu_limit = "---" #"NO_LIMIT" - cpu_usage = "" - - else: - cpu_limit = str(value.get('cpu_limit')) + "m" - - - row = [value.get('namespace'), pod, memory_limit, helper_.bytes_to_kb_mb_gb(value.get('memory_usage')), memory_usage_percentage, helper_.bytes_to_kb_mb_gb(value.get('memory_usage_max')), memory_free, cpu_limit, str(value.get('cpu_usage')) + "m"] - table.append(row) - - out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=False) - print(out) - - - def topPvc(self, pod=".*", namespace="default"): - """ - Return number of CPU seconds used per pods. - """ - output = { - "success": False, - "fail_reason": "", - "result": {} - } - try: - # Get PVCs Names used by the Pod. - pvcs_names_query = f'sum(kube_pod_spec_volumes_persistentvolumeclaims_info{{namespace=~"{namespace}", pod=~"{pod}", container=~".*"}}) by (namespace, persistentvolumeclaim, volume, pod)' - pvc_names_result = self.run_query(pvcs_names_query) - if not pvc_names_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_names_query}" - return output - if not pvc_names_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_names_query}" - return output - - pvcs_dct = {} - for pvc in pvc_names_result.get('data').get('result'): - pvcs_dct[pvc.get('metric').get('persistentvolumeclaim')] = { - "namespace": pvc.get('metric').get('namespace'), - "pod": pvc.get('metric').get('pod'), - "volume": pvc.get('metric').get('volume'), - "capacity": -1, - "used": -1, - "available": -1, - } - - - # Get PVCs capacity - pvcs_capacity_query = f'sum(kubelet_volume_stats_capacity_bytes{{namespace=~"{namespace}", persistentvolumeclaim=~".*"}}) by (persistentvolumeclaim, namespace)' - pvcs_capacity_result = self.run_query(pvcs_capacity_query) - # import rich - # rich.print(pvcs_capacity_result) - # exit(1) - if not pvcs_capacity_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_capacity_query}" - return output - if not pvcs_capacity_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_capacity_query}" - # return output - if pvcs_capacity_result.get('data').get('result'): - for pvc_json in pvcs_capacity_result.get('data').get('result'): - pvcs_dct[pvc_json.get('metric').get('persistentvolumeclaim')]['capacity'] = int(pvc_json.get('value')[1]) - - - # Get PVCs used - pvcs_used_query = f'sum(kubelet_volume_stats_used_bytes{{namespace=~"{namespace}", persistentvolumeclaim=~".*"}}) by (persistentvolumeclaim, namespace)' - pvcs_used_result = self.run_query(pvcs_used_query) - if not pvcs_used_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_used_query}" - return output - if not pvcs_used_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_used_query}" - # return output - if pvcs_used_result.get('data').get('result'): - for pvc_json in pvcs_used_result.get('data').get('result'): - pvcs_dct[pvc_json.get('metric').get('persistentvolumeclaim')]['used'] = int(pvc_json.get('value')[1]) - - # Get PVCs available - pvcs_available_query = f'sum(kubelet_volume_stats_available_bytes{{namespace=~"{namespace}", persistentvolumeclaim=~".*"}}) by (persistentvolumeclaim, namespace)' - pvcs_available_result = self.run_query(pvcs_available_query) - if not pvcs_available_result.get('status') == 'success': - output['fail_reason'] = f"could not get metric's value: {pvcs_available_query}" - return output - if not pvcs_available_result.get('data').get('result'): - output['fail_reason'] = f"Query did not return any data: {pvcs_available_query}" - # return output - if pvcs_available_result.get('data').get('result'): - for pvc_json in pvcs_available_result.get('data').get('result'): - pvcs_dct[pvc_json.get('metric').get('persistentvolumeclaim')]['available'] = int(pvc_json.get('value')[1]) - - output['result'] = pvcs_dct - output['success'] = True - - except Exception as e: - output['success']: False - output['fail_reason'] = e - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - return output - - - def topPvcTable(self, namespace="default"): - """ - """ - pvc_json = self.topPvc(namespace=namespace) - # import rich - # rich.print_json(data=pvc_json) - if not pvc_json.get('success'): - print(f"No pvc's found in the '{namespace}' namespace \n{bcolors.WARNING + str(pvc_json.get('fail_reason') ) + bcolors.ENDC}") - exit(1) - - table = [['NAMESPACE', 'PVC', 'VOLUME', 'CAPACITY', 'USED', 'USED%', 'FREE', 'FREE%' ]] - for pvc, value in pvc_json.get('result').items(): - - if value.get('capacity') != -1: - capacity = helper_.bytes_to_kb_mb_gb(value.get('capacity')) - else: - capacity = "?" - - if value.get('used') != -1: - used = helper_.bytes_to_kb_mb_gb(value.get('used')) - used_percentage = helper_.percentage(value.get('used'), value.get('capacity')) - else: - used = "?" - used_percentage = "?" - - if value.get('available') != -1: - available = helper_.bytes_to_kb_mb_gb(value.get('available')) - available_percentage = helper_.percentage(value.get('available'), value.get('capacity')) - else: - available = "?" - available_percentage = "?" - - row = [value.get('namespace'), pvc, value.get('volume'), capacity, used, used_percentage, available, available_percentage] - table.append(row) - out = tabulate(table, headers='firstrow', tablefmt='plain', showindex=False) - print(out) - diff --git a/kubePtop/pod_monitor.py b/kubePtop/pod_monitor.py deleted file mode 100644 index f192a57..0000000 --- a/kubePtop/pod_monitor.py +++ /dev/null @@ -1,580 +0,0 @@ -import time -from tabulate import tabulate -# import textwrap -from datetime import datetime, timezone -import threading -import rich -from rich.live import Live -from rich.table import Table -from rich.panel import Panel -from rich.progress import SpinnerColumn, Progress, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn, TimeElapsedColumn -from rich.layout import Layout -from rich.console import Console, Group -from rich.rule import Rule -from rich.console import Console -from rich.markdown import Markdown -from rich.text import Text -import traceback -from kubePtop.global_attrs import GlobalAttrs -from kubePtop.ascii_graph import AsciiGraph -from kubePtop.colors import Bcolors -bcolors = Bcolors() - - -# from kubePtop.global_attrs import GlobalAttrs -from kubePtop.pod_metrics import PrometheusPodsMetrics -from kubePtop.helper import Helper -helper_ = Helper() -from kubePtop.logging import Logging - - -class Pod_Monitoring(PrometheusPodsMetrics): - def __init__(self): - super().__init__() - - def pod_monitor(self, pod, node=".*", container=".*", namespace="default"): - # Print loading because the layout may take few seconds to start (Probably due to slow connection) - rich.print("[blink]Loading ...", end="\r") - - def make_layout() -> Layout: - """ - The layout structure - """ - layout = Layout(name="root") - - layout.split( - Layout(name="header", size=3), - # Layout(name="header2", size=7, ratio=1), - Layout(name="main", ratio=1), - # Layout(name="footer", size=6, ratio=1) - ) - layout["main"].split_row( - # Layout(name="side",), - Layout(name="body", ratio=3, minimum_size=100,), - ) - # layout["side"].split(Layout(name="box1")) # , Layout(name="box2") - # layout["body"].split(Layout(name="head", size=5, ratio=2), Layout(name="body1")) # , Layout(name="box2") - layout["body"].split_row(Layout(name="body1", size=55), Layout(name="body2"),) # , Layout(name="box2") - layout['body1'].split_column(Layout(name="body1_a"), Layout(name="body1_b", size=11)) - - layout["body2"].split(Layout(name="body2_a", ratio=1), Layout(name="body2_b", ratio=1)) # , Layout(name="box2") - layout['body2_b'].split_row(Layout(name="body2_b_a", ratio=1), Layout(name="body2_b_b", ratio=1)) - layout['body2_a'].split_row(Layout(name="body2_a_a", ratio=1), Layout(name="body2_a_b", ratio=1)) - - return layout - - class Header(): - """ - Display header with clock. - """ - def __rich__(self) -> Panel: - grid = Table.grid(expand=True) - grid.add_column(justify="center", ratio=1) - grid.add_column(justify="right") - grid.add_row( - f"[b]Pod: [/b] {pod} [b]Namespace: [/b] {namespace} [b]Container: [/b] {container}", - datetime.now().ctime().replace(":", "[blink]:[/]"), - ) - return Panel(grid, style="green") - - class Pod_Resources_Progress(PrometheusPodsMetrics): - def __init__(self): - super().__init__() - self.mem_total_bytes = 0 - self.cpu_limit = 0 - self.progress_start() - - def progress_start(self): - self.progress_threads_status = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - self.task_thread_refresh = self.progress_threads_status.add_task(description=f"[white]Metrics Refresh", status=f"unknown") - self.task_prometheus_server_connection = self.progress_threads_status.add_task(description=f"[white]Prometheus", status=f"unknown") - - - self.progress_mem_total = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - self.task_mem_total = self.progress_mem_total.add_task(description=f"[white]Mem Limit ", status=" Loading") - - self.progress_mem = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - self.task_mem_used = self.progress_mem.add_task(completed=0, description=f"[white]Mem used ", total=self.mem_total_bytes, status="Loading") - self.task_mem_used_max = self.progress_mem.add_task(completed=0, description=f"[white]Mem used max ", total=self.mem_total_bytes, status="Loading") - self.task_mem_cached = self.progress_mem.add_task(completed=0, description=f"[white]Mem cached ", total=self.mem_total_bytes, status="Loading") - - - self.progress_cpu_load_avg = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - # TextColumn("[progress.percentage]{task.percentage:>3.0f}"), - TextColumn("{task.fields[status]}"), - ) - self.task_cpu_limit = self.progress_cpu_load_avg.add_task(description=f"[white]CPU Limit ", status="Loading") - self.task_cpu_load_avg_10s = self.progress_cpu_load_avg.add_task(description=f"[white]CPU load avg 10s", status="Loading") - - - self.progress_cpu = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=20), - TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - self.task_cpu_used = self.progress_cpu.add_task(completed=0, description=f"[white]CPU used TOTAL ", total=100, status="Loading") - self.task_cpu_used_system = self.progress_cpu.add_task(completed=0, description=f"[white]CPU used SYS ", total=100, status="Loading") - self.task_cpu_used_user = self.progress_cpu.add_task(completed=0, description=f"[white]CPU used USER ", total=100, status="Loading") - - self.progress_extra = Progress(TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=16), - # TaskProgressColumn(), - TextColumn("{task.fields[status]}"), - ) - self.task_extra_uptime = self.progress_extra.add_task(completed=0, description=f"[white]UP Time ", total=100, status="Loading") - start_time_json = self.podStartTime(pod=pod, namespace=namespace, container=container) - if start_time_json.get('success'): - start_time = helper_.convert_epoch_timestamp(start_time_json.get('result')) - else: - start_time = start_time_json.get('fail_reason') - self.task_extra_start_time = self.progress_extra.add_task(completed=0, description=f"[white]Start Time ", total=100, status=start_time) - self.task_extra_file_discriptors = self.progress_extra.add_task(completed=0, description=f"[white]File Descriptors ", total=100, status="Loading") - self.task_extra_threads = self.progress_extra.add_task(completed=0, description=f"[white]Threads ", total=100, status="Loading") - self.task_extra_processes = self.progress_extra.add_task(completed=0, description=f"[white]Processes ", total=100, status="Loading") - - - self.group_memory = Group ( - self.progress_mem_total, - self.progress_mem, - # Rule(style='#AAAAAA'), - # self.progress_swap, - ) - - self.group_cpu = Group ( - self.progress_cpu_load_avg, - Rule(style='#AAAAAA'), - self.progress_cpu, - ) - - def update(self): - time.sleep(3) - while True: - try: - pod_metrics_json = self.podMetrics(pod, node, container, namespace) - pod_mem_metrics_json = pod_metrics_json.get('memory') - pod_cpu_metrics_json = pod_metrics_json.get('cpu') - - ## Update Memory progress bars. - if pod_mem_metrics_json.get('MemLimitBytes').get('success'): - if pod_mem_metrics_json.get('MemLimitBytes').get('result') != 0: - self.progress_mem_total.update(task_id=self.task_mem_total, description=f"[white]Mem Limit ", status=f" {helper_.bytes_to_kb_mb_gb(pod_mem_metrics_json.get('MemLimitBytes').get('result'))}") - self.mem_total_bytes = pod_mem_metrics_json.get('MemLimitBytes').get('result') - else: - self.progress_mem_total.update(task_id=self.task_mem_total, description=f"[white]Mem Limit ", status=f" NO_LIMIT") - else: - self.progress_mem_total.update(task_id=self.task_mem_total, description=f"[white]Mem Limit ", status=pod_mem_metrics_json.get('MemLimitBytes').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - - if pod_mem_metrics_json.get('MemUsageBytes').get('success'): - self.progress_mem.update(task_id=self.task_mem_used, completed=pod_mem_metrics_json.get('MemUsageBytes').get('result'), description=f"[white]Mem used ", total=self.mem_total_bytes, status=f"{helper_.bytes_to_kb_mb_gb(pod_mem_metrics_json.get('MemUsageBytes').get('result'))}") - else: - self.progress_mem.update(task_id=self.task_mem_used, completed=0, description=f"[white]Mem used ", total=100, status=pod_mem_metrics_json.get('MemUsageBytes').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - - if pod_mem_metrics_json.get('MemUsageMaxBytes').get('success'): - self.progress_mem.update(task_id=self.task_mem_used_max, completed=pod_mem_metrics_json.get('MemUsageMaxBytes').get('result'), description=f"[white]Mem used max ", total=self.mem_total_bytes, status=f"{helper_.bytes_to_kb_mb_gb(pod_mem_metrics_json.get('MemUsageMaxBytes').get('result'))}") - else: - self.progress_mem.update(task_id=self.task_mem_used_max, completed=0, description=f"[white]Mem used max ", total=100, status=pod_mem_metrics_json.get('MemUsageMaxBytes').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - - if pod_mem_metrics_json.get('MemCachedBytes').get('success'): - self.progress_mem.update(task_id=self.task_mem_cached, completed=pod_mem_metrics_json.get('MemCachedBytes').get('result'), description=f"[white]Mem cached ", total=self.mem_total_bytes, status=f"{helper_.bytes_to_kb_mb_gb(pod_mem_metrics_json.get('MemCachedBytes').get('result'))}") - else: - self.progress_mem.update(task_id=self.task_mem_cached, completed=0, description=f"[white]Mem cached ", total=100, status=pod_mem_metrics_json.get('MemCachedBytes').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - - - ## Update CPU progress bars. - if pod_cpu_metrics_json.get('cpuLoadAvg10s').get('success'): - self.progress_cpu_load_avg.update(task_id=self.task_cpu_load_avg_10s, description=f"[white]CPU load avg 10s", status=f" {pod_cpu_metrics_json.get('cpuLoadAvg10s').get('result')}") - else: - self.progress_cpu_load_avg.update(task_id=self.task_cpu_load_avg_10s, description=f"[white]CPU load avg 10s", status=pod_cpu_metrics_json.get('cpuLoadAvg10s').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - if pod_cpu_metrics_json.get('cpuQuotaMilicores').get('success'): - self.cpu_limit = pod_cpu_metrics_json.get('cpuQuotaMilicores').get('result') - self.progress_cpu_load_avg.update(task_id=self.task_cpu_limit, description=f"[white]CPU Limit ", total=self.cpu_limit, status=f"{pod_cpu_metrics_json.get('cpuQuotaMilicores').get('result')}m") - else: - self.progress_cpu_load_avg.update(task_id=self.task_cpu_limit, description=f"[white]CPU Limit ", total=self.cpu_limit, status=" NO_LIMIT") - - if pod_cpu_metrics_json.get('cpuLoadAvg10s').get('success'): - self.progress_cpu_load_avg.update(task_id=self.task_cpu_load_avg_10s, description=f"[white]CPU load avg 10s", total=self.cpu_limit, status=f" {pod_cpu_metrics_json.get('cpuLoadAvg10s').get('result')}") - else: - self.progress_cpu_load_avg.update(task_id=self.task_cpu_load_avg_10s, description=f"[white]CPU load avg 10s", total=self.cpu_limit, status=pod_cpu_metrics_json.get('cpuLoadAvg10s').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - - if pod_cpu_metrics_json.get('cpuUsageAVG10mMilicores').get('success'): - self.progress_cpu.update(task_id=self.task_cpu_used, completed=pod_cpu_metrics_json.get('cpuUsageAVG10mMilicores').get('result'), description=f"[white]CPU used TOTAL ", total=self.cpu_limit, status=f"{pod_cpu_metrics_json.get('cpuUsageAVG10mMilicores').get('result')}m") - else: - self.progress_cpu.update(task_id=self.task_cpu_used, completed=0, description=f"[white]CPU used TOTAL ", total=100, status=pod_cpu_metrics_json.get('cpuUsageAVG10mMilicores').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - if pod_cpu_metrics_json.get('cpuUsageSystemAVG10mMilicores').get('success'): - self.progress_cpu.update(task_id=self.task_cpu_used_system, completed=pod_cpu_metrics_json.get('cpuUsageSystemAVG10mMilicores').get('result'), description=f"[white]CPU used SYS ", total=self.task_cpu_limit, status=f"{pod_cpu_metrics_json.get('cpuUsageSystemAVG10mMilicores').get('result')}m") - else: - self.progress_cpu.update(task_id=self.task_cpu_used_system, completed=0, description=f"[white]CPU used SYS ", total=100, status=pod_cpu_metrics_json.get('cpuUsageSystemAVG10mMilicores').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - if pod_cpu_metrics_json.get('cpuUsageUserAVG10mMilicores').get('success'): - self.progress_cpu.update(task_id=self.task_cpu_used_user, completed=pod_cpu_metrics_json.get('cpuUsageUserAVG10mMilicores').get('result'), description=f"[white]CPU used USER ", total=self.task_cpu_limit, status=f"{pod_cpu_metrics_json.get('cpuUsageUserAVG10mMilicores').get('result')}m") - else: - self.progress_cpu.update(task_id=self.task_cpu_used_user, completed=0, description=f"[white]CPU used USER ", total=100, status=pod_cpu_metrics_json.get('cpuUsageUserAVG10mMilicores').get('fail_reason')) - GlobalAttrs.exceptions_num +=1 - - - # Update Extra progress bars - pod_uptime_json = self.podUpTime(pod=pod, container=container, namespace=namespace) - if pod_uptime_json.get('success'): - self.progress_extra.update(task_id=self.task_extra_uptime, completed=0, description=f"[white]UP Time ", total=100, status=helper_.sec_to_m_h_d(pod_uptime_json.get('result'))) - pass - else: - self.progress_extra.update(task_id=self.task_extra_uptime, completed=0, description=f"[white]UP Time ", total=100, status=pod_uptime_json.get('fail_reason')) - - pod_file_descriptors_json = self.podFileDescriptors(pod=pod, container=container, namespace=namespace) - if pod_file_descriptors_json.get('success'): - self.progress_extra.update(task_id=self.task_extra_file_discriptors, completed=0, description=f"[white]File Descriptors ", total=100, status=int(pod_file_descriptors_json.get('result'))) - pass - else: - self.progress_extra.update(task_id=self.task_extra_file_discriptors, completed=0, description=f"[white]File Descriptors ", total=100, status=pod_file_descriptors_json.get('fail_reason')) - - pod_threads_json = self.podThreads(pod=pod, container=container, namespace=namespace) - if pod_threads_json.get('success'): - self.progress_extra.update(task_id=self.task_extra_threads, completed=0, description=f"[white]Threads ", total=100, status=int(pod_threads_json.get('result'))) - else: - self.progress_extra.update(task_id=self.task_extra_threads, completed=0, description=f"[white]Threads ", total=100, status=pod_threads_json.get('fail_reason')) - - pod_processes_json = self.podProcesses(pod=pod, container=container, namespace=namespace) - if pod_processes_json.get('success'): - self.progress_extra.update(task_id=self.task_extra_processes, completed=0, description=f"[white]Processes ", total=100, status=int(pod_processes_json.get('result'))) - else: - self.progress_extra.update(task_id=self.task_extra_processes, completed=0, description=f"[white]Processes ", total=100, status=pod_processes_json.get('fail_reason')) - - - time.sleep(GlobalAttrs.live_update_interval) - except Exception as e: - Logging.log.error("Got an Exception while updating Progress Bars:") - Logging.log.error(e) - Logging.log.exception(traceback.format_stack()) - - def check_thread_node_resources(self, restart=True): - while True: - def thread_status(): - status = "" - if self.thread_node_resources.is_alive(): - status = f"alive [green]✔️" - else: - status = "dead [red]❌" - if restart: - # Restart thread - self.start_threads() - return status - - self.progress_threads_status.update(task_id=self.task_thread_refresh, status=thread_status()) - time.sleep(GlobalAttrs.live_update_interval) - - class ValidatePrometheuesConnection(PrometheusPodsMetrics): - def __init__(self): - super().__init__() - self.result = {} - - def run(self): - while True: - self.result = self.verify_prometheus_connection() - time.sleep(5) - - def check_thread_prometheus_server_connection(self): - while True: - - def thread_status(): - result = self.vlaidate_prometheus_server.result - # if self.thread_check_thread_prometheus_server_connection.is_alive(): - if result.get('connected') is None: - status = f"waiting [green]✔️" - elif result.get('connected'): - status = f"connected [green]✔️" - else: - status = f"{result.get('reason')} [red]❌" - - return status - - self.progress_threads_status.update(task_id=self.task_prometheus_server_connection, status=f"{thread_status()} ({self.vlaidate_prometheus_server.result.get('status_code')})") - time.sleep(5) - - def start_threads(self): - self.thread_node_resources = threading.Thread(target=self.update) - self.thread_node_resources.daemon = True - self.thread_node_resources.start() - - self.vlaidate_prometheus_server = self.ValidatePrometheuesConnection() - self.thread_prometheus_server_connection = threading.Thread(target=self.vlaidate_prometheus_server.run) - self.thread_prometheus_server_connection.daemon = True - self.thread_prometheus_server_connection.start() - - def watch_threads(self): - self.thread_check_thread_node_resources = threading.Thread(target=self.check_thread_node_resources) - self.thread_check_thread_node_resources.daemon = True - self.thread_check_thread_node_resources.start() - - self.thread_check_thread_prometheus_server_connection = threading.Thread(target=self.check_thread_prometheus_server_connection) - self.thread_check_thread_prometheus_server_connection.daemon = True - self.thread_check_thread_prometheus_server_connection.start() - - - - - try: - pod_resources_progress = Pod_Resources_Progress() - - progress_table = Table.grid(expand=True) - progress_table.add_row( - Panel(pod_resources_progress.group_cpu, title="[b]CPU", padding=(1, 2)), - ) - progress_table.add_row( - Panel(pod_resources_progress.group_memory, title="[b]Memory", padding=(1, 2)), - ) - progress_table.add_row( - Panel(pod_resources_progress.progress_extra, title='[b]Extra', padding=(1, 2)), - ) - progress_table.add_row( - Panel(pod_resources_progress.progress_threads_status, title="[b]Threads Status",padding=(1, 2), subtitle=""), - ) - - layout = make_layout() - layout["header"].update(Header()) - layout["body1_a"].update(progress_table) - layout['body1_b'].update(Panel("Made with [red]❤️[/red]", title='[b]Unused Space', padding=(1, 2),)) - - pod_resources_progress.start_threads() - pod_resources_progress.watch_threads() - - update_network_received_bytes_graph = True - network_received_bytes_graph = AsciiGraph() - update_network_transmit_bytes_graph = True - network_transmit_bytes_graph = AsciiGraph() - network_received_bytes = self.podNetworkReceiveBytes(pod, namespace=namespace) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'network_received_bytes' metrics; Result:\n{network_received_bytes}") - else: - Logging.log.info("Getting Pod 'network_received_bytes' metrics") - if network_received_bytes.get('success'): - network_received_bytes_graph.create_graph(network_received_bytes.get('result').keys(), height=6, width=GlobalAttrs.graphs_width -3, format='{:8.0f} kb/s') - network_transmit_bytes_graph.create_graph(network_received_bytes.get('result').keys(), height=6, width=GlobalAttrs.graphs_width -3, format='{:8.0f} kb/s') - - else: - network_received_bytes_graph.graph = network_received_bytes.get('fail_reason') - update_network_received_bytes_graph = False - network_transmit_bytes_graph.graph = network_received_bytes.get('fail_reason') - update_network_transmit_bytes_graph = False - - - # update_network_transmit_bytes_graph = True - # network_transmit_bytes_graph = AsciiGraph() - # network_transmit_bytes = self.podNetworkTransmitBytes(pod, namespace=namespace) - # Logging.log.info("Getting Pod 'network_transmit_bytes' metrics") - # Logging.log.info(network_transmit_bytes) - # if network_transmit_bytes.get('success'): - # network_transmit_bytes_graph.create_graph(network_transmit_bytes.get('result').keys(), height=6, width=42, format='{:8.0f} kb/s') - # else: - # network_transmit_bytes_graph.graph = network_transmit_bytes.get('fail_reason') - # update_network_transmit_bytes_graph = False - - - update_disk_read_bytes_graph = True - update_disk_write_bytes_graph = True - disk_read_bytes_graph = AsciiGraph() - disk_write_bytes_graph = AsciiGraph() - disk_read_bytes = self.podDiskReadBytes(pod=pod, container=container, namespace=namespace) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'disk_read_bytes' metrics; Result:\n{disk_read_bytes}") - else: - Logging.log.info("Getting Pod 'disk_read_bytes' metrics") - if disk_read_bytes.get('success'): - disk_read_bytes_graph.create_graph(disk_read_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width -3, format='{:8.0f} kb/s') - disk_write_bytes_graph.create_graph(disk_read_bytes.get('result').keys(), height=5, width=GlobalAttrs.graphs_width -3, format='{:8.0f} kb/s') - else: - disk_read_bytes_graph.graph = disk_read_bytes.get('fail_reason') - disk_write_bytes_graph.graph = disk_read_bytes.get('fail_reason') - update_disk_read_bytes_graph = False - update_disk_write_bytes_graph = False - - # update_disk_write_bytes_graph = True - # disk_write_bytes_graph = AsciiGraph() - # disk_write_bytes = self.podDiskWriteBytes(pod=pod, container=container, namespace=namespace) - # Logging.log.info("Getting Pod 'disk_write_bytes' metrics") - # Logging.log.info(disk_write_bytes) - # if disk_write_bytes.get('success'): - # disk_write_bytes_graph.create_graph(disk_write_bytes.get('result').keys(), height=5, width=45, format='{:8.0f} kb/s') - # else: - # disk_write_bytes_graph.graph = disk_write_bytes.get('fail_reason') - # update_disk_write_bytes_graph = False - - - layout["body2_b_b"].update(Panel(Markdown("Loading ..."), title="[b]Network IO", padding=(1, 1))) - layout["body2_b_a"].update(Panel(Markdown("Loading ..."), title="[b]Disk IO", padding=(1, 1))) - - update_containers_mem_usage = True - containers_mem_usage_graph = AsciiGraph() - containers_mem_usage_range_graph = AsciiGraph() - containers_mem_usage = self.podMemUsagePerContainers(pod=pod, container=container, namespace=namespace) - if containers_mem_usage.get('success'): - containers_mem_usage_graph.create_graph(containers_mem_usage.get('result').keys(), height=5, width=GlobalAttrs.graphs_width - 3, format='{:8.0f} mb') - containers_mem_usage_range_graph.create_graph(containers_mem_usage.get('result').keys(), height=5, width=GlobalAttrs.graphs_width -3, format='{:8.0f} mb') - else: - containers_mem_usage_graph.graph = containers_mem_usage.get('fail_reason') - update_containers_mem_usage = False - - - layout["body2_a_a"].update(Panel(Markdown("Loading ..."), title="[b]Memory Usage per Containers", padding=(1, 1))) - - group_mem_usage_per_containers = Group( - Markdown("[CURRENT]", justify='center'), - Text.from_ansi(containers_mem_usage_graph.graph + f"\n {containers_mem_usage_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("[LAST 3 HOURS]", justify='center'), - ) - - - update_pod_pvcs_usage = True - pod_pvcs_usage_graph = AsciiGraph() - pod_pvcs = self.podPVC(pod, namespace) - if pod_pvcs.get('success'): - pod_pvcs_usage_graph.create_graph(pod_pvcs.get('result').keys(), height=6, width=GlobalAttrs.graphs_width, format='{:8.0f} mb') - else: - pod_pvcs_usage_graph.graph = f"{bcolors.BOLD + bcolors.WARNING} [ No PVCs found ]{bcolors.ENDC}\n{bcolors.GRAY}{pod_pvcs.get('fail_reason')}" - update_pod_pvcs_usage = False - - layout["body2_a_b"].update(Panel(Markdown("Loading..."), title="[b]PVCs used by the pod", padding=(1, 1))) - - - Logging.log.info("Starting the Layout.") - with Live(layout, auto_refresh=True, screen=True, refresh_per_second=GlobalAttrs.live_update_interval): - while True: - - if update_containers_mem_usage: - containers_mem_usage = self.podMemUsagePerContainers(pod=pod, container=container, namespace=namespace) - for c, value in containers_mem_usage.get('result').items(): - containers_mem_usage_graph.update_lst(c, helper_.bytes_to_mb(value)) - containers_mem_usage_range = self.podMemUsagePerContainers_range(pod=pod, container=c, namespace=namespace) - containers_mem_usage_range_graph.replace_lst(c, containers_mem_usage_range.get('result')) - - group_mem_usage_per_containers = Group( - Markdown("[CURRENT]", justify='center'), - Text.from_ansi(containers_mem_usage_graph.graph + f"\n {containers_mem_usage_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("[LAST 3 HOURS]", justify='center'), - Text.from_ansi(containers_mem_usage_range_graph.graph + f"\n {containers_mem_usage_range_graph.colors_description_str}"), - - ) - layout["body2_a_a"].update(Panel(group_mem_usage_per_containers, title="[b]Memory Usage per Containers", padding=(1, 1))) - - - if update_pod_pvcs_usage: - pod_pvcs = self.podPVC(pod, namespace) - for pvc, value in pod_pvcs.get('result').items(): - pod_pvcs_usage_graph.update_lst(pvc, helper_.bytes_to_mb(value.get('used'))) - - group_pod_pvcs = Group( - Text.from_ansi(pod_pvcs_usage_graph.graph + f"\n {pod_pvcs_usage_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - self.podPVC_table(pod, namespace) - ) - layout["body2_a_b"].update(Panel(group_pod_pvcs, title="[b]PVCs used by the pod", padding=(1, 1))) - - - if update_network_received_bytes_graph: - network_received_bytes = self.podNetworkReceiveBytes(pod, namespace) - if GlobalAttrs.debug: - Logging.log.debug(f"Getting Pod 'network_received_bytes' metrics; Result:\n{network_received_bytes}") - else: - Logging.log.info("Getting Pod 'network_received_bytes' metrics") - for device, value in network_received_bytes.get('result').items(): - network_received_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - if update_network_transmit_bytes_graph: - network_transmit_bytes = self.podNetworkTransmitBytes(pod, namespace) - if GlobalAttrs.debug: - Logging.log.debug(f"Updating Pod 'network_transmit_bytes' metrics; Result:\n{network_transmit_bytes}") - else: - Logging.log.info("Updating Pod 'network_transmit_bytes' metrics") - for device, value in network_transmit_bytes.get('result').items(): - network_transmit_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - group_network_io = Group( - Markdown("Bytes Received", justify='center'), - Text.from_ansi(network_received_bytes_graph.graph + f"\n {network_received_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Transmitted", justify='center'), - Text.from_ansi(network_transmit_bytes_graph.graph + f"\n {network_transmit_bytes_graph.colors_description_str}"), - ) - - layout["body2_b_b"].update(Panel(group_network_io, title="[b]Network IO", padding=(1, 1))) - - - if update_disk_read_bytes_graph: - disk_read_bytes = self.podDiskReadBytes(pod=pod, container=container, namespace=namespace) - if GlobalAttrs.debug: - Logging.log.debug(f"Updating Pod 'disk_read_bytes' metrics; Result:\n{disk_read_bytes}") - else: - Logging.log.info("Updating Pod 'disk_read_bytes' metrics") - for device, value in disk_read_bytes.get('result').items(): - disk_read_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - if update_disk_write_bytes_graph: - disk_write_bytes = self.podDiskWriteBytes(pod=pod, container=container, namespace=namespace) - if GlobalAttrs.debug: - Logging.log.debug(f"Updating Pod 'disk_write_bytes' metrics; Result:\n{disk_write_bytes}") - else: - Logging.log.info("Updating Pod 'disk_write_bytes' metrics") - for device, value in disk_write_bytes.get('result').items(): - disk_write_bytes_graph.update_lst(device, helper_.bytes_to_kb(value)) - - group_disk_io = Group( - Markdown("Bytes Read", justify='center'), - Text.from_ansi(disk_read_bytes_graph.graph + f"\n {disk_read_bytes_graph.colors_description_str}"), - Rule(style='#AAAAAA'), - Markdown("Bytes Write", justify='center'), - Text.from_ansi(disk_write_bytes_graph.graph + f"\n {disk_write_bytes_graph.colors_description_str}"), - ) - layout["body2_b_a"].update(Panel(group_disk_io, title="[b]Disk IO", padding=(1, 1))) - - - - - - time.sleep(GlobalAttrs.live_update_interval) - except Exception as e: - rich.print("\n[yellow]ERROR -- " + str(e)) - rich.print("\n[underline bold]Exception:") - traceback.print_exc() - exit(1) - except KeyboardInterrupt: - print(" ", end="\r") - rich.print("Ok") - if GlobalAttrs.exceptions_num > 0: - print(f"Found {GlobalAttrs.exceptions_num} Exceptions, you can find the errors in the log file: {GlobalAttrs.log_file_path}") - exit(0) - - - - \ No newline at end of file