diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py index c20ab2b..1f1bc4f 100644 --- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -219,7 +219,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 32 +LIBPATCH = 35 logger = logging.getLogger(__name__) @@ -665,14 +665,14 @@ def _template_panels( continue if not existing_templates: datasource = panel.get("datasource") - if type(datasource) == str: + if isinstance(datasource, str): if "loki" in datasource: panel["datasource"] = "${lokids}" elif "grafana" in datasource: continue else: panel["datasource"] = "${prometheusds}" - elif type(datasource) == dict: + elif isinstance(datasource, dict): # In dashboards exported by Grafana 9, datasource type is dict dstype = datasource.get("type", "") if dstype == "loki": @@ -686,7 +686,7 @@ def _template_panels( logger.error("Unknown datasource format: skipping") continue else: - if type(panel["datasource"]) == str: + if isinstance(panel["datasource"], str): if panel["datasource"].lower() in replacements.values(): # Already a known template variable continue @@ -701,7 +701,7 @@ def _template_panels( if replacement: used_replacements.append(ds) panel["datasource"] = replacement or panel["datasource"] - elif type(panel["datasource"]) == dict: + elif isinstance(panel["datasource"], dict): dstype = panel["datasource"].get("type", "") if panel["datasource"].get("uid", "").lower() in replacements.values(): # Already a known template variable @@ -790,7 +790,7 @@ def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: # We need to use an index so we can insert the changed element back later for panel_idx, panel in enumerate(panels): - if type(panel) is not dict: + if not isinstance(panel, dict): continue # Use the index to insert it back in the same location @@ -831,11 +831,11 @@ def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict: if "datasource" not in panel.keys(): continue - if type(panel["datasource"]) == str: + if isinstance(panel["datasource"], str): if panel["datasource"] not in known_datasources: continue querytype = known_datasources[panel["datasource"]] - elif type(panel["datasource"]) == dict: + elif isinstance(panel["datasource"], dict): if panel["datasource"]["uid"] not in known_datasources: continue querytype = known_datasources[panel["datasource"]["uid"]] @@ -1195,6 +1195,7 @@ def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> `grafana_dashboaard` relationship is joined """ if self._charm.unit.is_leader(): + self._update_all_dashboards_from_dir() self._upset_dashboards_on_relation(event.relation) def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: diff --git a/lib/charms/observability_libs/v0/metrics_endpoint_discovery.py b/lib/charms/observability_libs/v0/metrics_endpoint_discovery.py index 45b1981..d77962e 100644 --- a/lib/charms/observability_libs/v0/metrics_endpoint_discovery.py +++ b/lib/charms/observability_libs/v0/metrics_endpoint_discovery.py @@ -47,7 +47,7 @@ def _on_endpoints_change(self, event): from pathlib import Path from typing import Dict, Iterable -from lightkube import Client +from lightkube import Client # pyright: ignore from lightkube.resources.core_v1 import Pod from ops.charm import CharmBase, CharmEvents from ops.framework import EventBase, EventSource, Object, StoredState @@ -62,7 +62,7 @@ def _on_endpoints_change(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 7 # File path where metrics endpoint change data is written for exchange # between the discovery process and the materialised event. @@ -199,7 +199,7 @@ def main(): """ labels, run_cmd, unit, charm_dir = sys.argv[1:] - client = Client() + client = Client() # pyright: ignore labels = json.loads(labels) for change, entity in client.watch(Pod, namespace="*", labels=labels): @@ -208,18 +208,20 @@ def main(): Path(PAYLOAD_FILE_PATH).unlink() meta = entity.metadata metrics_path = "" - if entity.metadata.annotations.get("prometheus.io/path", ""): - metrics_path = entity.metadata.annotations.get("prometheus.io/path", "") + if entity.metadata.annotations.get("prometheus.io/path", ""): # pyright: ignore + metrics_path = entity.metadata.annotations.get( # pyright: ignore + "prometheus.io/path", "" + ) target_ports = [] - for c in filter(lambda c: c.ports is not None, entity.spec.containers): - for p in filter(lambda p: p.name == "metrics", c.ports): + for c in filter(lambda c: c.ports is not None, entity.spec.containers): # pyright: ignore + for p in filter(lambda p: p.name == "metrics", c.ports): # pyright: ignore target_ports.append("*:{}".format(p.containerPort)) payload = { "change": change, - "namespace": meta.namespace, - "name": meta.name, + "namespace": meta.namespace, # pyright: ignore + "name": meta.name, # pyright: ignore "path": metrics_path, "targets": target_ports or ["*:80"], } diff --git a/lib/charms/observability_libs/v1/kubernetes_service_patch.py b/lib/charms/observability_libs/v1/kubernetes_service_patch.py index 64dd13c..2cce729 100644 --- a/lib/charms/observability_libs/v1/kubernetes_service_patch.py +++ b/lib/charms/observability_libs/v1/kubernetes_service_patch.py @@ -127,7 +127,7 @@ def setUp(self, *unused): from types import MethodType from typing import List, Literal, Optional, Union -from lightkube import ApiError, Client +from lightkube import ApiError, Client # pyright: ignore from lightkube.core import exceptions from lightkube.models.core_v1 import ServicePort, ServiceSpec from lightkube.models.meta_v1 import ObjectMeta @@ -146,7 +146,7 @@ def setUp(self, *unused): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 9 ServiceType = Literal["ClusterIP", "LoadBalancer"] @@ -268,7 +268,7 @@ def _patch(self, _) -> None: PatchFailed: if patching fails due to lack of permissions, or otherwise. """ try: - client = Client() + client = Client() # pyright: ignore except exceptions.ConfigError as e: logger.warning("Error creating k8s client: %s", e) return @@ -300,7 +300,7 @@ def is_patched(self) -> bool: Returns: bool: A boolean indicating if the service patch has been applied. """ - client = Client() + client = Client() # pyright: ignore return self._is_patched(client) def _is_patched(self, client: Client) -> bool: @@ -314,7 +314,7 @@ def _is_patched(self, client: Client) -> bool: raise # Construct a list of expected ports, should the patch be applied - expected_ports = [(p.port, p.targetPort) for p in self.service.spec.ports] + expected_ports = [(p.port, p.targetPort) for p in self.service.spec.ports] # type: ignore[attr-defined] # Construct a list in the same manner, using the fetched service fetched_ports = [ (p.port, p.targetPort) for p in service.spec.ports # type: ignore[attr-defined] diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index cac364e..be96768 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -18,13 +18,6 @@ Source code can be found on GitHub at: https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s -## Dependencies - -Using this library requires you to fetch the juju_topology library from -[observability-libs](https://charmhub.io/observability-libs/libraries/juju_topology). - -`charmcraft fetch-lib charms.observability_libs.v0.juju_topology` - ## Provider Library Usage This Prometheus charm interacts with its scrape targets using its @@ -343,12 +336,11 @@ def _on_scrape_targets_changed(self, event): from collections import defaultdict from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from urllib.error import HTTPError, URLError from urllib.parse import urlparse -from urllib.request import urlopen import yaml -from charms.observability_libs.v0.juju_topology import JujuTopology +from cosl import JujuTopology +from cosl.rules import AlertRules from ops.charm import CharmBase, RelationRole from ops.framework import ( BoundEvent, @@ -370,7 +362,9 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 38 +LIBPATCH = 46 + +PYDEPS = ["cosl"] logger = logging.getLogger(__name__) @@ -392,6 +386,7 @@ def _on_scrape_targets_changed(self, event): "basic_auth", "tls_config", "authorization", + "params", } DEFAULT_JOB = { "metrics_path": "/metrics", @@ -526,8 +521,8 @@ def expand_wildcard_targets_into_individual_jobs( # for such a target. Therefore labeling with Juju topology, excluding the # unit name. non_wildcard_static_config["labels"] = { - **non_wildcard_static_config.get("labels", {}), **topology.label_matcher_dict, + **non_wildcard_static_config.get("labels", {}), } non_wildcard_static_configs.append(non_wildcard_static_config) @@ -552,9 +547,9 @@ def expand_wildcard_targets_into_individual_jobs( if topology: # Add topology labels modified_static_config["labels"] = { - **modified_static_config.get("labels", {}), **topology.label_matcher_dict, **{"juju_unit": unit_name}, + **modified_static_config.get("labels", {}), } # Instance relabeling for topology should be last in order. @@ -610,12 +605,12 @@ def render_alertmanager_static_configs(alertmanagers: List[str]): return { "alertmanagers": [ { + # For https we still do not render a `tls_config` section because + # certs are expected to be made available by the charm via the + # `update-ca-certificates` mechanism. "scheme": scheme, "path_prefix": path_prefix, "static_configs": [{"targets": netlocs}], - # FIXME figure out how to get alertmanager's ca_file into here - # Without this, prom errors: "x509: certificate signed by unknown authority" - "tls_config": {"insecure_skip_verify": True}, } for (scheme, path_prefix), netlocs in paths.items() ] @@ -770,7 +765,7 @@ def _validate_relation_by_interface_and_direction( actual_relation_interface = relation.interface_name if actual_relation_interface != expected_relation_interface: raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface + relation_name, expected_relation_interface, actual_relation_interface or "None" ) if expected_relation_role == RelationRole.provides: @@ -838,206 +833,6 @@ def _is_single_alert_rule_format(rules_dict: dict) -> bool: return set(rules_dict) >= {"alert", "expr"} -class AlertRules: - """Utility class for amalgamating prometheus alert rule files and injecting juju topology. - - An `AlertRules` object supports aggregating alert rules from files and directories in both - official and single rule file formats using the `add_path()` method. All the alert rules - read are annotated with Juju topology labels and amalgamated into a single data structure - in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be - easily dumped into JSON format and exchanged over relation data. The dictionary can also - be dumped into YAML format and written directly into an alert rules file that is read by - Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, - since Prometheus allows only a single list of alert rule groups per alert rules file. - - The official Prometheus format is a YAML file conforming to the Prometheus documentation - (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). - The custom single rule format is a subsection of the official YAML, having a single alert - rule, effectively "one alert per file". - """ - - # This class uses the following terminology for the various parts of a rule file: - # - alert rules file: the entire groups[] yaml, including the "groups:" key. - # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list - # of dictionaries that have the "name" and "rules" keys. - # - alert group (singular): a single dictionary that has the "name" and "rules" keys. - # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with - # the "alert" and "expr" keys. - # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. - - def __init__(self, topology: Optional[JujuTopology] = None): - """Build and alert rule object. - - Args: - topology: an optional `JujuTopology` instance that is used to annotate all alert rules. - """ - self.topology = topology - self.tool = CosTool(None) - self.alert_groups = [] # type: List[dict] - - def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: - """Read a rules file from path, injecting juju topology. - - Args: - root_path: full path to the root rules folder (used only for generating group name) - file_path: full path to a *.rule file. - - Returns: - A list of dictionaries representing the rules file, if file is valid (the structure is - formed by `yaml.safe_load` of the file); an empty list otherwise. - """ - with file_path.open() as rf: - # Load a list of rules from file then add labels and filters - try: - rule_file = yaml.safe_load(rf) - - except Exception as e: - logger.error("Failed to read alert rules from %s: %s", file_path.name, e) - return [] - - if not rule_file: - logger.warning("Empty rules file: %s", file_path.name) - return [] - if not isinstance(rule_file, dict): - logger.error("Invalid rules file (must be a dict): %s", file_path.name) - return [] - if _is_official_alert_rule_format(rule_file): - alert_groups = rule_file["groups"] - elif _is_single_alert_rule_format(rule_file): - # convert to list of alert groups - # group name is made up from the file name - alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] - else: - # invalid/unsupported - logger.error("Invalid rules file: %s", file_path.name) - return [] - - # update rules with additional metadata - for alert_group in alert_groups: - # update group name with topology and sub-path - alert_group["name"] = self._group_name( - str(root_path), - str(file_path), - alert_group["name"], - ) - - # add "juju_" topology labels - for alert_rule in alert_group["rules"]: - if "labels" not in alert_rule: - alert_rule["labels"] = {} - - if self.topology: - alert_rule["labels"].update(self.topology.label_matcher_dict) - # insert juju topology filters into a prometheus alert rule - alert_rule["expr"] = self.tool.inject_label_matchers( - re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]), - self.topology.label_matcher_dict, - ) - - return alert_groups - - def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: - """Generate group name from path and topology. - - The group name is made up of the relative path between the root dir_path, the file path, - and topology identifier. - - Args: - root_path: path to the root rules dir. - file_path: path to rule file. - group_name: original group name to keep as part of the new augmented group name - - Returns: - New group name, augmented by juju topology and relative path. - """ - rel_path = os.path.relpath(os.path.dirname(file_path), root_path) - rel_path = "" if rel_path == "." else rel_path.replace(os.path.sep, "_") - - # Generate group name: - # - name, from juju topology - # - suffix, from the relative path of the rule file; - group_name_parts = [self.topology.identifier] if self.topology else [] - group_name_parts.extend([rel_path, group_name, "alerts"]) - # filter to remove empty strings - return "_".join(filter(None, group_name_parts)) - - @classmethod - def _multi_suffix_glob( - cls, dir_path: Path, suffixes: List[str], recursive: bool = True - ) -> list: - """Helper function for getting all files in a directory that have a matching suffix. - - Args: - dir_path: path to the directory to glob from. - suffixes: list of suffixes to include in the glob (items should begin with a period). - recursive: a flag indicating whether a glob is recursive (nested) or not. - - Returns: - List of files in `dir_path` that have one of the suffixes specified in `suffixes`. - """ - all_files_in_dir = dir_path.glob("**/*" if recursive else "*") - return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir)) - - def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: - """Read all rule files in a directory. - - All rules from files for the same directory are loaded into a single - group. The generated name of this group includes juju topology. - By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. - - Args: - dir_path: directory containing *.rule files (alert rules without groups). - recursive: flag indicating whether to scan for rule files recursively. - - Returns: - a list of dictionaries representing prometheus alert rule groups, each dictionary - representing an alert group (structure determined by `yaml.safe_load`). - """ - alert_groups = [] # type: List[dict] - - # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob( - dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive - ): - alert_groups_from_file = self._from_file(dir_path, file_path) - if alert_groups_from_file: - logger.debug("Reading alert rule from %s", file_path) - alert_groups.extend(alert_groups_from_file) - - return alert_groups - - def add_path(self, path: str, *, recursive: bool = False) -> None: - """Add rules from a dir path. - - All rules from files are aggregated into a data structure representing a single rule file. - All group names are augmented with juju topology. - - Args: - path: either a rules file or a dir of rules files. - recursive: whether to read files recursively or not (no impact if `path` is a file). - - Returns: - True if path was added else False. - """ - path = Path(path) # type: Path - if path.is_dir(): - self.alert_groups.extend(self._from_dir(path, recursive)) - elif path.is_file(): - self.alert_groups.extend(self._from_file(path.parent, path)) - else: - logger.debug("Alert rules path does not exist: %s", path) - - def as_dict(self) -> dict: - """Return standard alert rules file in dict representation. - - Returns: - a dictionary containing a single list of alert rule groups. - The list of alert rule groups is provided as value of the - "groups" dictionary key. - """ - return {"groups": self.alert_groups} if self.alert_groups else {} - - class TargetsChangedEvent(EventBase): """Event emitted when Prometheus scrape targets change.""" @@ -1063,7 +858,7 @@ class MonitoringEvents(ObjectEvents): class MetricsEndpointConsumer(Object): """A Prometheus based Monitoring service.""" - on = MonitoringEvents() + on = MonitoringEvents() # pyright: ignore def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): """A Prometheus based Monitoring service. @@ -1220,7 +1015,6 @@ def alerts(self) -> dict: try: scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) identifier = JujuTopology.from_dict(scrape_metadata).identifier - alerts[identifier] = self._tool.apply_label_matchers(alert_rules) # type: ignore except KeyError as e: logger.debug( @@ -1235,6 +1029,10 @@ def alerts(self) -> dict: ) continue + # We need to append the relation info to the identifier. This is to allow for cases for there are two + # relations which eventually scrape the same application. Issue #551. + identifier = f"{identifier}_{relation.name}_{relation.id}" + alerts[identifier] = alert_rules _, errmsg = self._tool.validate_alert_rules(alert_rules) @@ -1328,7 +1126,7 @@ def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: # Inject topology and put it back in the list rule["expr"] = self._tool.inject_label_matchers( re.sub(r"%%juju_topology%%,?", "", rule["expr"]), - topology.label_matcher_dict, + topology.alert_expression_dict, ) except KeyError: # Some required JujuTopology key is missing. Just move on. @@ -1382,16 +1180,8 @@ def _static_scrape_config(self, relation) -> list: scrape_configs, hosts, topology ) - # If scheme is https but no ca section present, then auto add "insecure_skip_verify", - # otherwise scraping errors out with "x509: certificate signed by unknown authority". - # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config - for scrape_config in scrape_configs: - tls_config = scrape_config.get("tls_config", {}) - ca_present = "ca" in tls_config or "ca_file" in tls_config - if scrape_config.get("scheme") == "https" and not ca_present: - tls_config["insecure_skip_verify"] = True - scrape_config["tls_config"] = tls_config - + # For https scrape targets we still do not render a `tls_config` section because certs + # are expected to be made available by the charm via the `update-ca-certificates` mechanism. return scrape_configs def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: @@ -1508,7 +1298,7 @@ def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> st class MetricsEndpointProvider(Object): """A metrics endpoint for Prometheus.""" - on = MetricsEndpointProviderEvents() + on = MetricsEndpointProviderEvents() # pyright: ignore def __init__( self, @@ -1739,7 +1529,7 @@ def set_scrape_job_spec(self, _=None): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules(topology=self.topology) + alert_rules = AlertRules(query_type="promql", topology=self.topology) alert_rules.add_path(self._alert_rules_path, recursive=True) alert_rules_as_dict = alert_rules.as_dict() @@ -1747,12 +1537,11 @@ def set_scrape_job_spec(self, _=None): relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) - if alert_rules_as_dict: - # Update relation data with the string representation of the rule file. - # Juju topology is already included in the "scrape_metadata" field above. - # The consumer side of the relation uses this information to name the rules file - # that is written to the filesystem. - relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. + # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) def _set_unit_ip(self, _=None): """Set unit host address. @@ -1886,7 +1675,7 @@ def _update_relation_data(self, _): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules() + alert_rules = AlertRules(query_type="promql") alert_rules.add_path(self.dir_path, recursive=self._recursive) alert_rules_as_dict = alert_rules.as_dict() @@ -2050,14 +1839,16 @@ def _set_prometheus_data(self, event): return jobs = [] + _type_convert_stored( - self._stored.jobs + self._stored.jobs # pyright: ignore ) # list of scrape jobs, one per relation for relation in self.model.relations[self._target_relation]: targets = self._get_targets(relation) if targets and relation.app: jobs.append(self._static_scrape_job(targets, relation.app.name)) - groups = [] + _type_convert_stored(self._stored.alert_rules) # list of alert rule groups + groups = [] + _type_convert_stored( + self._stored.alert_rules # pyright: ignore + ) # list of alert rule groups for relation in self.model.relations[self._alert_rules_relation]: unit_rules = self._get_alert_rules(relation) if unit_rules and relation.app: @@ -2109,7 +1900,7 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: jobs.append(updated_job) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - if not _type_convert_stored(self._stored.jobs) == jobs: + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs def _on_prometheus_targets_departed(self, event): @@ -2161,7 +1952,7 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - if not _type_convert_stored(self._stored.jobs) == jobs: + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore self._stored.jobs = jobs def _job_name(self, appname) -> str: @@ -2266,16 +2057,7 @@ def _static_config_extra_labels(self, target: Dict[str, str]) -> Dict[str, str]: logger.debug("Could not perform DNS lookup for %s", target["hostname"]) dns_name = target["hostname"] extra_info["dns_name"] = dns_name - label_re = re.compile(r'(?P