diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py index 7d7549a..1f1bc4f 100644 --- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -218,7 +218,8 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 16 + +LIBPATCH = 35 logger = logging.getLogger(__name__) @@ -229,15 +230,15 @@ def __init__(self, *args): TOPOLOGY_TEMPLATE_DROPDOWNS = [ # type: ignore { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", "definition": "label_values(up,juju_model)", "description": None, "error": None, "hide": 0, - "includeAll": False, + "includeAll": True, "label": "Juju model", - "multi": False, + "multi": True, "name": "juju_model", "query": { "query": "label_values(up,juju_model)", @@ -254,18 +255,18 @@ def __init__(self, *args): "useTags": False, }, { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "definition": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)', "description": None, "error": None, "hide": 0, - "includeAll": False, + "includeAll": True, "label": "Juju model uuid", - "multi": False, + "multi": True, "name": "juju_model_uuid", "query": { - "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', + "query": 'label_values(up{juju_model=~"$juju_model"},juju_model_uuid)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -279,18 +280,18 @@ def __init__(self, *args): "useTags": False, }, { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)', 
"description": None, "error": None, "hide": 0, - "includeAll": False, + "includeAll": True, "label": "Juju application", - "multi": False, + "multi": True, "name": "juju_application", "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid"},juju_application)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -304,18 +305,18 @@ def __init__(self, *args): "useTags": False, }, { - "allValue": None, + "allValue": ".*", "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', "description": None, "error": None, "hide": 0, - "includeAll": False, + "includeAll": True, "label": "Juju unit", - "multi": False, + "multi": True, "name": "juju_unit", "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},juju_unit)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -335,9 +336,9 @@ def __init__(self, *args): "description": None, "error": None, "hide": 0, - "includeAll": False, - "label": None, - "multi": False, + "includeAll": True, + "label": "Prometheus datasource", + "multi": True, "name": "prometheusds", "options": [], "query": "prometheus", @@ -350,9 +351,9 @@ def __init__(self, *args): "description": None, "error": None, "hide": 0, - "includeAll": False, - "label": None, - "multi": False, + "includeAll": True, + "label": "Loki datasource", + "multi": True, "name": "lokids", "options": [], "query": "loki", @@ -366,17 +367,17 @@ 
def __init__(self, *args): REACTIVE_CONVERTER = { # type: ignore "allValue": None, "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "definition": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', "description": None, "error": None, "hide": 0, - "includeAll": False, + "includeAll": True, "label": "hosts", "multi": True, "name": "host", "options": [], "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', + "query": 'label_values(up{juju_model=~"$juju_model",juju_model_uuid=~"$juju_model_uuid",juju_application=~"$juju_application"},host)', "refId": "StandardVariableQuery", }, "refresh": 1, @@ -524,7 +525,7 @@ def _validate_relation_by_interface_and_direction( relation = charm.meta.relations[relation_name] actual_relation_interface = relation.interface_name - if actual_relation_interface != expected_relation_interface: + if actual_relation_interface and actual_relation_interface != expected_relation_interface: raise RelationInterfaceMismatchError( relation_name, expected_relation_interface, actual_relation_interface ) @@ -581,7 +582,7 @@ def _convert_dashboard_fields(content: str, inject_dropdowns: bool = True) -> st # If no existing template variables exist, just insert our own if "templating" not in dict_content: - dict_content["templating"] = {"list": [d for d in template_dropdowns]} # type: ignore + dict_content["templating"] = {"list": list(template_dropdowns)} # type: ignore else: # Otherwise, set a flag so we can go back later existing_templates = True @@ -608,51 +609,118 @@ def _replace_template_fields( # noqa: C901 If existing datasource variables are present, try to substitute them. 
""" replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"} - used_replacements = [] + used_replacements = [] # type: List[str] # If any existing datasources match types we know, or we didn't find # any templating variables at all, template them. if datasources or not existing_templates: - panels = dict_content["panels"] + panels = dict_content.get("panels", {}) + if panels: + dict_content["panels"] = _template_panels( + panels, replacements, used_replacements, existing_templates, datasources + ) - # Go through all of the panels. If they have a datasource set, AND it's one - # that we can convert to ${lokids} or ${prometheusds}, by stripping off the - # ${} templating and comparing the name to the list we built, replace it, - # otherwise, leave it alone. - # - # COS only knows about Prometheus and Loki. - for panel in panels: - if "datasource" not in panel or not panel.get("datasource", ""): - continue - if not existing_templates: - if "loki" in panel.get("datasource"): + # Find panels nested under rows + rows = dict_content.get("rows", {}) + if rows: + for row_idx, row in enumerate(rows): + if "panels" in row.keys(): + rows[row_idx]["panels"] = _template_panels( + row["panels"], + replacements, + used_replacements, + existing_templates, + datasources, + ) + + dict_content["rows"] = rows + + # Finally, go back and pop off the templates we stubbed out + deletions = [] + for tmpl in dict_content["templating"]["list"]: + if tmpl["name"] and tmpl["name"] in used_replacements: + deletions.append(tmpl) + + for d in deletions: + dict_content["templating"]["list"].remove(d) + + return dict_content + + +def _template_panels( + panels: dict, + replacements: dict, + used_replacements: list, + existing_templates: bool, + datasources: dict, +) -> dict: + """Iterate through a `panels` object and template it appropriately.""" + # Go through all the panels. 
If they have a datasource set, AND it's one + # that we can convert to ${lokids} or ${prometheusds}, by stripping off the + # ${} templating and comparing the name to the list we built, replace it, + # otherwise, leave it alone. + # + for panel in panels: + if "datasource" not in panel or not panel.get("datasource"): + continue + if not existing_templates: + datasource = panel.get("datasource") + if isinstance(datasource, str): + if "loki" in datasource: panel["datasource"] = "${lokids}" + elif "grafana" in datasource: + continue else: panel["datasource"] = "${prometheusds}" + elif isinstance(datasource, dict): + # In dashboards exported by Grafana 9, datasource type is dict + dstype = datasource.get("type", "") + if dstype == "loki": + panel["datasource"]["uid"] = "${lokids}" + elif dstype == "prometheus": + panel["datasource"]["uid"] = "${prometheusds}" + else: + logger.debug("Unrecognized datasource type '%s'; skipping", dstype) + continue else: + logger.error("Unknown datasource format: skipping") + continue + else: + if isinstance(panel["datasource"], str): if panel["datasource"].lower() in replacements.values(): # Already a known template variable continue # Strip out variable characters and maybe braces ds = re.sub(r"(\$|\{|\})", "", panel["datasource"]) + + if ds not in datasources.keys(): + # Unknown, non-templated datasource, potentially a Grafana builtin + continue + replacement = replacements.get(datasources[ds], "") if replacement: used_replacements.append(ds) panel["datasource"] = replacement or panel["datasource"] + elif isinstance(panel["datasource"], dict): + dstype = panel["datasource"].get("type", "") + if panel["datasource"].get("uid", "").lower() in replacements.values(): + # Already a known template variable + continue + # Strip out variable characters and maybe braces + ds = re.sub(r"(\$|\{|\})", "", panel["datasource"].get("uid", "")) - # Put our substitutions back - dict_content["panels"] = panels - - # Finally, go back and pop off the 
templates we stubbed out - deletions = [] - for tmpl in dict_content["templating"]["list"]: - if tmpl["name"] and tmpl["name"] in used_replacements: - deletions.append(tmpl) - - for d in deletions: - dict_content["templating"]["list"].remove(d) + if ds not in datasources.keys(): + # Unknown, non-templated datasource, potentially a Grafana builtin + continue - return dict_content + replacement = replacements.get(datasources[ds], "") + if replacement: + used_replacements.append(ds) + panel["datasource"]["uid"] = replacement + else: + logger.error("Unknown datasource format: skipping") + continue + return panels def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: @@ -710,7 +778,7 @@ def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: if "panels" not in dict_content.keys(): return json.dumps(dict_content) - # Go through all of the panels and inject topology labels + # Go through all the panels and inject topology labels # Panels may have more than one 'target' where the expressions live, so that must be # accounted for. Additionally, `promql-transform` does not necessarily gracefully handle # expressions with range queries including variables. Exclude these. 
@@ -722,7 +790,7 @@ def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: # We need to use an index so we can insert the changed element back later for panel_idx, panel in enumerate(panels): - if type(panel) is not dict: + if not isinstance(panel, dict): continue # Use the index to insert it back in the same location @@ -758,13 +826,22 @@ def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict: # If there's no expression, we don't need to do anything if "expr" not in target.keys(): continue + expr = target["expr"] if "datasource" not in panel.keys(): continue - elif panel["datasource"] not in known_datasources: + + if isinstance(panel["datasource"], str): + if panel["datasource"] not in known_datasources: + continue + querytype = known_datasources[panel["datasource"]] + elif isinstance(panel["datasource"], dict): + if panel["datasource"]["uid"] not in known_datasources: + continue + querytype = known_datasources[panel["datasource"]["uid"]] + else: + logger.error("Unknown datasource format: skipping") continue - querytype = known_datasources[panel["datasource"]] - expr = target["expr"] # Capture all values inside `[]` into a list which we'll iterate over later to # put them back in-order. 
Then apply the regex again and replace everything with @@ -824,13 +901,12 @@ def _type_convert_stored(obj): """Convert Stored* to their appropriate types, recursively.""" if isinstance(obj, StoredList): return list(map(_type_convert_stored, obj)) - elif isinstance(obj, StoredDict): + if isinstance(obj, StoredDict): rdict = {} # type: Dict[Any, Any] for k in obj.keys(): rdict[k] = _type_convert_stored(obj[k]) return rdict - else: - return obj + return obj class GrafanaDashboardsChanged(EventBase): @@ -879,7 +955,7 @@ def restore(self, snapshot): """Restore grafana source information.""" self.error_message = snapshot["error_message"] self.valid = snapshot["valid"] - self.errors = json.loads(snapshot["errors"]) + self.errors = json.loads(str(snapshot["errors"])) class GrafanaProviderEvents(ObjectEvents): @@ -892,7 +968,7 @@ class GrafanaDashboardProvider(Object): """An API to provide Grafana dashboards to a Grafana charm.""" _stored = StoredState() - on = GrafanaProviderEvents() + on = GrafanaProviderEvents() # pyright: ignore def __init__( self, @@ -924,7 +1000,7 @@ def __init__( If you would like to use relation name other than `grafana-dashboard`, you will need to specify the relation name via the `relation_name` argument when instantiating the :class:`GrafanaDashboardProvider` object. - However, it is strongly advised to keep the the default relation name, + However, it is strongly advised to keep the default relation name, so that people deploying your charm will have a consistent experience with all other charms that provide Grafana dashboards. 
@@ -970,7 +1046,7 @@ def __init__( self._dashboards_path = dashboards_path # No peer relation bucket we can rely on providers, keep StoredState here, too - self._stored.set_default(dashboard_templates={}) + self._stored.set_default(dashboard_templates={}) # type: ignore self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) @@ -996,7 +1072,7 @@ def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None: """ # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore encoded_dashboard = _encode_dashboard_content(content) @@ -1017,7 +1093,7 @@ def remove_non_builtin_dashboards(self) -> None: """Remove all dashboards to the relation added via :method:`add_dashboard`.""" # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("prog:"): @@ -1044,7 +1120,7 @@ def _update_all_dashboards_from_dir( # Ensure we do not leave outdated dashboards by removing from stored all # the encoded dashboards that start with "file/". 
if self._dashboards_path: - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("file:"): @@ -1053,7 +1129,7 @@ def _update_all_dashboards_from_dir( # Path.glob uses fnmatch on the backend, which is pretty limited, so use a # custom function for the filter def _is_dashboard(p: Path) -> bool: - return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) for path in filter(_is_dashboard, Path(self._dashboards_path).glob("*")): # path = Path(path) @@ -1098,14 +1174,14 @@ def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None: e.grafana_dashboards_absolute_path, e.message, ) - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates: Any = self._stored.dashboard_templates # pyright: ignore for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("file:"): del stored_dashboard_templates[dashboard_id] self._stored.dashboard_templates = stored_dashboard_templates - # With all of the file-based dashboards cleared out, force a refresh + # With all the file-based dashboards cleared out, force a refresh # of relation data if self._charm.unit.is_leader(): for dashboard_relation in self._charm.model.relations[self._relation_name]: @@ -1119,6 +1195,7 @@ def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> `grafana_dashboaard` relationship is joined """ if self._charm.unit.is_leader(): + self._update_all_dashboards_from_dir() self._upset_dashboards_on_relation(event.relation) def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: @@ -1128,7 +1205,7 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> event: The `RelationChangedEvent` that 
triggered this handler. """ if self._charm.unit.is_leader(): - data = json.loads(event.relation.data[event.app].get("event", "{}")) + data = json.loads(event.relation.data[event.app].get("event", "{}")) # type: ignore if not data: return @@ -1136,16 +1213,18 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> valid = bool(data.get("valid", True)) errors = data.get("errors", []) if valid and not errors: - self.on.dashboard_status_changed.emit(valid=valid) + self.on.dashboard_status_changed.emit(valid=valid) # pyright: ignore else: - self.on.dashboard_status_changed.emit(valid=valid, errors=errors) + self.on.dashboard_status_changed.emit( # pyright: ignore + valid=valid, errors=errors + ) def _upset_dashboards_on_relation(self, relation: Relation) -> None: """Update the dashboards in the relation data bucket.""" # It's completely ridiculous to add a UUID, but if we don't have some # pseudo-random value, this never makes it across 'juju set-state' stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore "uuid": str(uuid.uuid4()), } @@ -1155,7 +1234,7 @@ def _content_to_dashboard_object(self, content: str, inject_dropdowns: bool = Tr return { "charm": self._charm.meta.name, "content": content, - "juju_topology": self._juju_topology, + "juju_topology": self._juju_topology if inject_dropdowns else {}, "inject_dropdowns": inject_dropdowns, } @@ -1174,13 +1253,13 @@ def _juju_topology(self) -> Dict: @property def dashboard_templates(self) -> List: """Return a list of the known dashboard templates.""" - return [v for v in self._stored.dashboard_templates.values()] + return list(self._stored.dashboard_templates.values()) # type: ignore class GrafanaDashboardConsumer(Object): """A consumer object for working with Grafana Dashboards.""" - on = GrafanaDashboardEvents() + on = GrafanaDashboardEvents() # pyright: ignore _stored = 
StoredState() def __init__( @@ -1228,7 +1307,7 @@ def __init__( self._relation_name = relation_name self._tranformer = CosTool(self._charm) - self._stored.set_default(dashboards=dict()) + self._stored.set_default(dashboards={}) # type: ignore self.framework.observe( self._charm.on[self._relation_name].relation_changed, @@ -1272,13 +1351,13 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> changes = self._render_dashboards_and_signal_changed(event.relation) if changes: - self.on.dashboards_changed.emit() + self.on.dashboards_changed.emit() # pyright: ignore def _on_grafana_peer_changed(self, _: RelationChangedEvent) -> None: """Emit dashboard events on peer events so secondary charm data updates.""" if self._charm.unit.is_leader(): return - self.on.dashboards_changed.emit() + self.on.dashboards_changed.emit() # pyright: ignore def update_dashboards(self, relation: Optional[Relation] = None) -> None: """Re-establish dashboards on one or more relations. @@ -1325,7 +1404,7 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # """ other_app = relation.app - raw_data = relation.data[other_app].get("dashboards", {}) # type: ignore + raw_data = relation.data[other_app].get("dashboards", "") # pyright: ignore if not raw_data: logger.warning( @@ -1340,11 +1419,6 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # # The only piece of data needed on this side of the relations is "templates" templates = data.pop("templates") - # Import only if a charmed operator uses the consumer, we don't impose these - # dependencies on the client - from jinja2 import Template - from jinja2.exceptions import TemplateSyntaxError - # The dashboards are WAY too big since this ultimately calls out to Juju to # set the relation data, and it overflows the maximum argument length for # subprocess, so we have to use b64, annoyingly. 
@@ -1357,14 +1431,12 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # relation_has_invalid_dashboards = False for _, (fname, template) in enumerate(templates.items()): - decoded_content = None content = None error = None topology = template.get("juju_topology", {}) try: - decoded_content = _decode_dashboard_content(template["content"]) + content = _decode_dashboard_content(template["content"]) inject_dropdowns = template.get("inject_dropdowns", True) - content = Template(decoded_content).render() content = self._manage_dashboard_uid(content, template) content = _convert_dashboard_fields(content, inject_dropdowns) @@ -1379,9 +1451,6 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # error = str(e.msg) logger.warning("Invalid JSON in Grafana dashboard: {}".format(fname)) continue - except TemplateSyntaxError as e: - error = str(e) - relation_has_invalid_dashboards = True # Prepend the relation name and ID to the dashboard ID to avoid clashes with # multiple relations with apps from the same charm, or having dashboards with @@ -1428,28 +1497,27 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # # Dropping dashboards for a relation needs to be signalled return True - else: - stored_data = rendered_dashboards - currently_stored_data = self._get_stored_dashboards(relation.id) - coerced_data = ( - _type_convert_stored(currently_stored_data) if currently_stored_data else {} - ) + stored_data = rendered_dashboards + currently_stored_data = self._get_stored_dashboards(relation.id) - if not coerced_data == stored_data: - stored_dashboards = self.get_peer_data("dashboards") - stored_dashboards[relation.id] = stored_data - self.set_peer_data("dashboards", stored_dashboards) - return True + coerced_data = _type_convert_stored(currently_stored_data) if currently_stored_data else {} + + if not coerced_data == stored_data: + stored_dashboards = self.get_peer_data("dashboards") + 
stored_dashboards[relation.id] = stored_data + self.set_peer_data("dashboards", stored_dashboards) + return True + return None # type: ignore def _manage_dashboard_uid(self, dashboard: str, template: dict) -> str: """Add an uid to the dashboard if it is not present.""" - dashboard = json.loads(dashboard) + dashboard_dict = json.loads(dashboard) - if not dashboard.get("uid", None) and "dashboard_alt_uid" in template: - dashboard["uid"] = template["dashboard_alt_uid"] + if not dashboard_dict.get("uid", None) and "dashboard_alt_uid" in template: + dashboard_dict["uid"] = template["dashboard_alt_uid"] - return json.dumps(dashboard) + return json.dumps(dashboard_dict) def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: """If an errored dashboard is in stored data, remove it and trigger a deletion.""" @@ -1457,7 +1525,7 @@ def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: stored_dashboards = self.get_peer_data("dashboards") stored_dashboards.pop(str(relation.id)) self.set_peer_data("dashboards", stored_dashboards) - self.on.dashboards_changed.emit() + self.on.dashboards_changed.emit() # pyright: ignore def _to_external_object(self, relation_id, dashboard): return { @@ -1511,9 +1579,8 @@ class GrafanaDashboardAggregator(Object): The :class:`GrafanaDashboardAggregator` object provides a way to collate and aggregate Grafana dashboards from reactive/machine charms and transport them into Charmed Operators, using Juju topology. - For detailed usage instructions, see the documentation for - :module:`lma-proxy-operator`, as this class is intended for use as a + :module:`cos-proxy-operator`, as this class is intended for use as a single point of intersection rather than use in individual charms. 
Since :class:`GrafanaDashboardAggregator` serves as a bridge between @@ -1525,7 +1592,6 @@ class GrafanaDashboardAggregator(Object): In its most streamlined usage, :class:`GrafanaDashboardAggregator` is integrated in a charmed operator as follows: - self.grafana = GrafanaDashboardAggregator(self) Args: @@ -1541,7 +1607,7 @@ class GrafanaDashboardAggregator(Object): """ _stored = StoredState() - on = GrafanaProviderEvents() + on = GrafanaProviderEvents() # pyright: ignore def __init__( self, @@ -1553,7 +1619,7 @@ def __init__( # Reactive charms may be RPC-ish and not leave reliable data around. Keep # StoredState here - self._stored.set_default( + self._stored.set_default( # type: ignore dashboard_templates={}, id_mappings={}, ) @@ -1595,42 +1661,48 @@ def _upset_dashboards_on_event(self, event: RelationEvent) -> None: return for id in dashboards: - self._stored.dashboard_templates[id] = self._content_to_dashboard_object( + self._stored.dashboard_templates[id] = self._content_to_dashboard_object( # type: ignore dashboards[id], event ) - self._stored.id_mappings[event.app.name] = dashboards + self._stored.id_mappings[event.app.name] = dashboards # type: ignore self._update_remote_grafana(event) def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: """Push dashboards to the downstream Grafana relation.""" # It's still ridiculous to add a UUID here, but needed stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore "uuid": str(uuid.uuid4()), } - for grafana_relation in self.model.relations[self._grafana_relation]: - grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + if self._charm.unit.is_leader(): + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) def remove_dashboards(self, event: 
RelationBrokenEvent) -> None: """Remove a dashboard if the relation is broken.""" - app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name]) + app_ids = _type_convert_stored(self._stored.id_mappings.get(event.app.name, "")) # type: ignore + + if not app_ids: + logger.info("Could not look up stored dashboards for %s", event.app.name) # type: ignore + return - del self._stored.id_mappings[event.app.name] + del self._stored.id_mappings[event.app.name] # type: ignore for id in app_ids: - del self._stored.dashboard_templates[id] + del self._stored.dashboard_templates[id] # type: ignore stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), + "templates": _type_convert_stored(self._stored.dashboard_templates), # pyright: ignore "uuid": str(uuid.uuid4()), } - for grafana_relation in self.model.relations[self._grafana_relation]: - grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) + if self._charm.unit.is_leader(): + for grafana_relation in self.model.relations[self._grafana_relation]: + grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) # Yes, this has a fair amount of branching. It's not that complex, though - def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 + def _strip_existing_datasources(self, dash: dict) -> dict: # noqa: C901 """Remove existing reactive charm datasource templating out. This method iterates through *known* places where reactive charms may set @@ -1639,7 +1711,7 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from the Grafana UI. It is not present in earlier Grafana versions, and can be disabled in 5.3.4 and above (optionally). If set, any values present will be substituted on - import. Some reactive charms use this for Prometheus. LMA2 uses dropdown selectors + import. Some reactive charms use this for Prometheus. 
COS uses dropdown selectors for datasources, and leaving this present results in "default" datasource values which are broken. @@ -1649,78 +1721,110 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 Further properties may be discovered. """ - dash = template["dashboard"] try: if "list" in dash["templating"]: for i in range(len(dash["templating"]["list"])): if ( "datasource" in dash["templating"]["list"][i] - and "Juju" in dash["templating"]["list"][i]["datasource"] + and dash["templating"]["list"][i]["datasource"] is not None ): - dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" + if "Juju" in dash["templating"]["list"][i].get("datasource", ""): + dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" + + # Strip out newly-added 'juju_application' template variables which + # don't line up with our drop-downs + dash_mutable = dash + for i in range(len(dash["templating"]["list"])): if ( "name" in dash["templating"]["list"][i] - and dash["templating"]["list"][i]["name"] == "host" + and dash["templating"]["list"][i].get("name", "") == "app" ): - dash["templating"]["list"][i] = REACTIVE_CONVERTER + del dash_mutable["templating"]["list"][i] + + if dash_mutable: + dash = dash_mutable except KeyError: logger.debug("No existing templating data in dashboard") if "__inputs" in dash: inputs = dash for i in range(len(dash["__inputs"])): - if dash["__inputs"][i]["pluginName"] == "Prometheus": + if dash["__inputs"][i].get("pluginName", "") == "Prometheus": del inputs["__inputs"][i] if inputs: dash["__inputs"] = inputs["__inputs"] else: del dash["__inputs"] - template["dashboard"] = dash - return template + return dash def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: """Look for a dashboard in relation data (during a reactive hook) or builtin by name.""" + if not self._charm.unit.is_leader(): + return {} + templates = [] id = "" # Reactive data can reliably be pulled out of events. 
In theory, if we got an event, # it's on the bucket, but using event explicitly keeps the mental model in # place for reactive - for k in event.relation.data[event.unit].keys(): + for k in event.relation.data[event.unit].keys(): # type: ignore if k.startswith("request_"): - templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) + templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) # type: ignore - for k in event.relation.data[event.app].keys(): + for k in event.relation.data[event.app].keys(): # type: ignore if k.startswith("request_"): - templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) + templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) # type: ignore builtins = self._maybe_get_builtin_dashboards(event) if not templates and not builtins: + logger.warning("NOTHING!") return {} dashboards = {} for t in templates: - # Replace values with LMA-style templating - t = self._strip_existing_datasources(t) - # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON # in the bucket back out to the actual "dashboard" we _need_, this is the way # This is not a mistake -- there's a double nesting in reactive charms, and # Grafana won't load it. We have to unbox: # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"], # and the final unboxing is below. 
- dash = json.dumps(t["dashboard"]) + # + # Apparently SOME newer dashboards (such as Ceph) do not have this double nesting, so + # now we get to account for both :toot: + dash = t.get("dashboard", {}) or t + + # Replace values with LMA-style templating + dash = self._strip_existing_datasources(dash) + dash = json.dumps(dash) # Replace the old-style datasource templates dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "\$datasource"', r'"datasource": "${prometheusds}"', dash + ) + dash = re.sub(r'"uid": "\$datasource"', r'"uid": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"', + r'"datasource": "${prometheusds}"', + dash, + ) + + # Yank out "new"+old LMA topology + dash = re.sub( + r'(,?\s?juju_application=~)\\"\$app\\"', r'\1\\"$juju_application\\"', dash + ) - from jinja2 import Template + # Replace old piechart panels + dash = re.sub(r'"type": "grafana-piechart-panel"', '"type": "piechart"', dash) + + from jinja2 import DebugUndefined, Template content = _encode_dashboard_content( - Template(dash).render(host=event.unit.name, datasource="prometheus") + Template(dash, undefined=DebugUndefined).render(datasource=r"${prometheusds}") # type: ignore ) id = "prog:{}".format(content[-24:-16]) @@ -1751,12 +1855,12 @@ def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict: if dashboards_path: - def _is_dashboard(p: Path) -> bool: - return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + def is_dashboard(p: Path) -> bool: + return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) - for path in filter(_is_dashboard, Path(dashboards_path).glob("*")): + for path in filter(is_dashboard, Path(dashboards_path).glob("*")): # path = Path(path) - if event.app.name in path.name: + if event.app.name in path.name: # type: ignore id = 
"file:{}".format(path.stem) builtins[id] = self._content_to_dashboard_object( _encode_dashboard_content(path.read_bytes()), event @@ -1766,7 +1870,7 @@ def _is_dashboard(p: Path) -> bool: def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict: return { - "charm": event.app.name, + "charm": event.app.name, # type: ignore "content": content, "juju_topology": self._juju_topology(event), "inject_dropdowns": True, @@ -1779,8 +1883,8 @@ def _juju_topology(self, event: RelationEvent) -> Dict: return { "model": self._charm.model.name, "model_uuid": self._charm.model.uuid, - "application": event.app.name, - "unit": event.unit.name, + "application": event.app.name, # type: ignore + "unit": event.unit.name, # type: ignore } @@ -1884,7 +1988,7 @@ def inject_label_matchers(self, expression: str, topology: dict, type: str) -> s args.extend(["--", "{}".format(expression)]) # noinspection PyBroadException try: - return self._exec(args) + return re.sub(r'="\$juju', r'=~"$juju', self._exec(args)) except subprocess.CalledProcessError as e: logger.debug('Applying the expression failed: "%s", falling back to the original', e) return expression diff --git a/lib/charms/loki_k8s/v0/loki_push_api.py b/lib/charms/loki_k8s/v0/loki_push_api.py index 6a05d5d..16e1294 100644 --- a/lib/charms/loki_k8s/v0/loki_push_api.py +++ b/lib/charms/loki_k8s/v0/loki_push_api.py @@ -12,13 +12,14 @@ implement the provider side of the `loki_push_api` relation interface. For instance, a Loki charm. The provider side of the relation represents the server side, to which logs are being pushed. -- `LokiPushApiConsumer`: This object is meant to be used by any Charmed Operator that needs to -send log to Loki by implementing the consumer side of the `loki_push_api` relation interface. -For instance, a Promtail or Grafana agent charm which needs to send logs to Loki. +- `LokiPushApiConsumer`: Used to obtain the loki api endpoint. 
This is useful for configuring + applications such as pebble, or charmed operators of workloads such as grafana-agent or promtail, + that can communicate with loki directly. -- `LogProxyConsumer`: This object can be used by any Charmed Operator which needs to -send telemetry, such as logs, to Loki through a Log Proxy by implementing the consumer side of the -`loki_push_api` relation interface. +- `LogProxyConsumer`: DEPRECATED. +This object can be used by any Charmed Operator which needs to send telemetry, such as logs, to +Loki through a Log Proxy by implementing the consumer side of the `loki_push_api` relation +interface. Filtering logs in Loki is largely performed on the basis of labels. In the Juju ecosystem, Juju topology labels are used to uniquely identify the workload which generates telemetry like logs. @@ -32,23 +33,22 @@ This object may be used by any Charmed Operator which implements the `loki_push_api` interface. For instance, Loki or Grafana Agent. -For this purposes a charm needs to instantiate the `LokiPushApiProvider` object with one mandatory +For this purpose a charm needs to instantiate the `LokiPushApiProvider` object with one mandatory and three optional arguments. - `charm`: A reference to the parent (Loki) charm. - `relation_name`: The name of the relation that the charm uses to interact - with its clients, which implement `LokiPushApiConsumer` or `LogProxyConsumer`. + with its clients, which implement `LokiPushApiConsumer` or `LogProxyConsumer` + (note that LogProxyConsumer is deprecated). If provided, this relation name must match a provided relation in metadata.yaml with the `loki_push_api` interface. - Typically `LokiPushApiConsumer` use "logging" as a relation_name and `LogProxyConsumer` use - "log_proxy". + The default relation name is "logging" for `LokiPushApiConsumer` and "log-proxy" for + `LogProxyConsumer` (note that LogProxyConsumer is deprecated). - The default value of this arguments is "logging". 
- - An example of this in a `metadata.yaml` file should have the following section: + For example, a provider's `metadata.yaml` file may look as follows: ```yaml provides: @@ -56,7 +56,7 @@ interface: loki_push_api ``` - For example, a Loki charm may instantiate the `LokiPushApiProvider` in its constructor as + Subsequently, a Loki charm may instantiate the `LokiPushApiProvider` in its constructor as follows: from charms.loki_k8s.v0.loki_push_api import LokiPushApiProvider @@ -69,21 +69,20 @@ class LokiOperatorCharm(CharmBase): def __init__(self, *args): super().__init__(*args) ... - self._loki_ready() + external_url = urlparse(self._external_url) + self.loki_provider = LokiPushApiProvider( + self, + address=external_url.hostname or self.hostname, + port=external_url.port or 80, + scheme=external_url.scheme, + path=f"{external_url.path}/loki/api/v1/push", + ) ... - def _loki_ready(self): - try: - version = self._loki_server.version - self.loki_provider = LokiPushApiProvider(self) - logger.debug("Loki Provider is available. Loki version: %s", version) - except LokiServerNotReadyError as e: - self.unit.status = MaintenanceStatus(str(e)) - except LokiServerError as e: - self.unit.status = BlockedStatus(str(e)) - - - `port`: Loki Push Api endpoint port. Default value: 3100. - - `rules_dir`: Directory to store alert rules. Default value: "/loki/rules". + - `port`: Loki Push Api endpoint port. Default value: `3100`. + - `scheme`: Loki Push Api endpoint scheme (`HTTP` or `HTTPS`). Default value: `HTTP` + - `address`: Loki Push Api endpoint address. Default value: `localhost` + - `path`: Loki Push Api endpoint path. Default value: `loki/api/v1/push` The `LokiPushApiProvider` object has several responsibilities: @@ -92,7 +91,7 @@ def _loki_ready(self): must be unique to all instances (e.g. using a load balancer). 2. Set the Promtail binary URL (`promtail_binary_zip_url`) so clients that use - `LogProxyConsumer` object can downloaded and configure it. 
+ `LogProxyConsumer` object could download and configure it. 3. Process the metadata of the consumer application, provided via the "metadata" field of the consumer data bag, which are used to annotate the @@ -222,14 +221,17 @@ def __init__(self, *args): ## LogProxyConsumer Library Usage -Let's say that we have a workload charm that produces logs and we need to send those logs to a +> Note: This object is deprecated. Consider migrating to LogForwarder (see v1/loki_push_api) with +> the release of Juju 3.6 LTS. + +Let's say that we have a workload charm that produces logs, and we need to send those logs to a workload implementing the `loki_push_api` interface, such as `Loki` or `Grafana Agent`. Adopting this object in a Charmed Operator consist of two steps: -1. Use the `LogProxyConsumer` class by instanting it in the `__init__` method of the charmed - operator. There are two ways to get logs in to promtail. You can give it a list of files to read - or you can write to it using the syslog protocol. +1. Use the `LogProxyConsumer` class by instantiating it in the `__init__` method of the charmed + operator. There are two ways to get logs in to promtail. You can give it a list of files to + read, or you can write to it using the syslog protocol. For example: @@ -396,7 +398,7 @@ def _promtail_error(self, event): The Loki charm may be related to multiple Loki client charms. Without this, filter rules submitted by one provider charm will also result in corresponding alerts for other -provider charms. Hence every alert rule expression must include such a topology filter stub. +provider charms. Hence, every alert rule expression must include such a topology filter stub. Gathering alert rules and generating rule files within the Loki charm is easily done using the `alerts()` method of `LokiPushApiProvider`. 
Alerts generated by Loki will automatically @@ -449,18 +451,17 @@ def _alert_rules_error(self, event): import subprocess import tempfile import typing -import uuid from copy import deepcopy from gzip import GzipFile from hashlib import sha256 from io import BytesIO from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union, cast from urllib import request from urllib.error import HTTPError import yaml -from charms.observability_libs.v0.juju_topology import JujuTopology +from cosl import JujuTopology from ops.charm import ( CharmBase, HookEvent, @@ -484,7 +485,9 @@ def _alert_rules_error(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 13 +LIBPATCH = 30 + +PYDEPS = ["cosl"] logger = logging.getLogger(__name__) @@ -608,7 +611,9 @@ def _validate_relation_by_interface_and_direction( actual_relation_interface = relation.interface_name if actual_relation_interface != expected_relation_interface: raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface + relation_name, + expected_relation_interface, + actual_relation_interface, # pyright: ignore ) if expected_relation_role == RelationRole.provides: @@ -766,7 +771,7 @@ def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: # any string as a "wildcard" which the topology labels will # filter down alert_rule["expr"] = self.tool.inject_label_matchers( - re.sub(r"%%juju_topology%%", r'job=".+"', alert_rule["expr"]), + re.sub(r"%%juju_topology%%", r'job=~".+"', alert_rule["expr"]), self.topology.label_matcher_dict, ) @@ -870,20 +875,20 @@ def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: return alert_groups - def add_path(self, path: str, *, recursive: bool = False): + def add_path(self, path_str: str, *, recursive: bool = False): """Add rules from a dir 
path. All rules from files are aggregated into a data structure representing a single rule file. All group names are augmented with juju topology. Args: - path: either a rules file or a dir of rules files. + path_str: either a rules file or a dir of rules files. recursive: whether to read files recursively or not (no impact if `path` is a file). Raises: InvalidAlertRulePathError: if the provided path is invalid. """ - path = Path(path) # type: Path + path = Path(path_str) # type: Path if path.is_dir(): self.alert_groups.extend(self._from_dir(path, recursive)) elif path.is_file(): @@ -996,6 +1001,8 @@ def __init__(self, handle, relation, relation_id, app=None, unit=None): def snapshot(self) -> Dict: """Save event information.""" + if not self.relation: + return {} snapshot = {"relation_name": self.relation.name, "relation_id": self.relation.id} if self.app: snapshot["app_name"] = self.app.name @@ -1056,7 +1063,7 @@ class LokiPushApiEvents(ObjectEvents): class LokiPushApiProvider(Object): """A LokiPushApiProvider class.""" - on = LokiPushApiEvents() + on = LokiPushApiEvents() # pyright: ignore def __init__( self, @@ -1078,6 +1085,10 @@ def __init__( It is strongly advised not to change the default, so that people deploying your charm will have a consistent experience with all other charms that consume metrics endpoints. + port: an optional port of the Loki service (default is "3100"). + scheme: an optional scheme of the Loki API URL (default is "http"). + address: an optional address of the Loki service (default is "localhost"). + path: an optional path of the Loki API URL (default is "loki/api/v1/push") Raises: RelationNotFoundError: If there is no relation in the charm's metadata.yaml @@ -1146,11 +1157,11 @@ def _on_logging_relation_changed(self, event: HookEvent): event: a `CharmEvent` in response to which the consumer charm must update its relation data. 
""" - should_update = self._process_logging_relation_changed(event.relation) + should_update = self._process_logging_relation_changed(event.relation) # pyright: ignore if should_update: self.on.loki_push_api_alert_rules_changed.emit( - relation=event.relation, - relation_id=event.relation.id, + relation=event.relation, # pyright: ignore + relation_id=event.relation.id, # pyright: ignore app=self._charm.app, unit=self._charm.unit, ) @@ -1187,7 +1198,7 @@ def _should_update_alert_rules(self, relation) -> bool: """Determine whether alert rules should be regenerated. If there are alert rules in the relation data bag, tell the charm - whether or not to regenerate them based on the boolean returned here. + whether to regenerate them based on the boolean returned here. """ if relation.data.get(relation.app).get("alert_rules", None) is not None: return True @@ -1208,7 +1219,7 @@ def _process_logging_relation_changed(self, relation: Relation) -> bool: relation: the `Relation` instance to update. Returns: - A boolean indicating whether an event should be emitted so we + A boolean indicating whether an event should be emitted, so we only emit one on lifecycle events """ relation.data[self._charm.unit]["public_address"] = socket.getfqdn() or "" @@ -1233,7 +1244,7 @@ def update_endpoint(self, url: str = "", relation: Optional[Relation] = None) -> This method should be used when the charm relying on this library needs to update the relation data in response to something occurring outside - of the `logging` relation lifecycle, e.g., in case of a + the `logging` relation lifecycle, e.g., in case of a host address change because the charmed operator becomes connected to an Ingress after the `logging` relation is established. @@ -1241,15 +1252,20 @@ def update_endpoint(self, url: str = "", relation: Optional[Relation] = None) -> url: An optional url value to update relation data. relation: An optional instance of `class:ops.model.Relation` to update. 
""" + # if no relation is specified update all of them if not relation: - if not self._charm.model.get_relation(self._relation_name): + if not self._charm.model.relations.get(self._relation_name): return - relation = self._charm.model.get_relation(self._relation_name) + relations_list = self._charm.model.relations.get(self._relation_name) + else: + relations_list = [relation] endpoint = self._endpoint(url or self._url) - relation.data[self._charm.unit].update({"endpoint": json.dumps(endpoint)}) + for relation in relations_list: + relation.data[self._charm.unit].update({"endpoint": json.dumps(endpoint)}) + logger.debug("Saved endpoint in unit relation data") @property @@ -1282,7 +1298,7 @@ def alerts(self) -> dict: # noqa: C901 separate alert rules file for each relation since the returned list of alert groups are indexed by relation ID. Also for each relation ID associated scrape metadata such as Juju model, UUID and application - name are provided so the a unique name may be generated for the rules + name are provided so a unique name may be generated for the rules file. 
For each relation the structure of data returned is a dictionary with four keys @@ -1306,59 +1322,135 @@ def alerts(self) -> dict: # noqa: C901 """ alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files for relation in self._charm.model.relations[self._relation_name]: - if not relation.units: + if not relation.units or not relation.app: continue alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) if not alert_rules: continue - errors = [] - try: - # NOTE: this `metadata` key SHOULD NOT be changed to `scrape_metadata` - # to align with Prometheus without careful consideration' - metadata = json.loads(relation.data[relation.app]["metadata"]) - identifier = JujuTopology.from_dict(metadata).identifier - labeled_alerts = self._tool.apply_label_matchers(alert_rules) - - _, errmsg = self._tool.validate_alert_rules(alert_rules) - if errmsg: - errors.append(errmsg) - continue + alert_rules = self._inject_alert_expr_labels(alert_rules) + + identifier, topology = self._get_identifier_by_alert_rules(alert_rules) + if not topology: + try: + metadata = json.loads(relation.data[relation.app]["metadata"]) + identifier = JujuTopology.from_dict(metadata).identifier + alerts[identifier] = self._tool.apply_label_matchers(alert_rules) # type: ignore + + except KeyError as e: + logger.debug( + "Relation %s has no 'metadata': %s", + relation.id, + e, + ) - alerts[identifier] = labeled_alerts - except KeyError as e: - logger.warning( - "Relation %s has no 'metadata': %s", - relation.id, - e, + if not identifier: + logger.error( + "Alert rules were found but no usable group or identifier was present." 
) + continue - if "groups" not in alert_rules: - logger.warning("No alert groups were found in relation data") - continue - # Construct an ID based on what's in the alert rules - for group in alert_rules["groups"]: + _, errmsg = self._tool.validate_alert_rules(alert_rules) + if errmsg: + relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg}) + continue + + alerts[identifier] = alert_rules + + return alerts + + def _get_identifier_by_alert_rules( + self, rules: dict + ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: + """Determine an appropriate dict key for alert rules. + + The key is used as the filename when writing alerts to disk, so the structure + and uniqueness is important. + + Args: + rules: a dict of alert rules + Returns: + A tuple containing an identifier, if found, and a JujuTopology, if it could + be constructed. + """ + if "groups" not in rules: + logger.debug("No alert groups were found in relation data") + return None, None + + # Construct an ID based on what's in the alert rules if they have labels + for group in rules["groups"]: + try: + labels = group["rules"][0]["labels"] + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + return topology.identifier, topology + except KeyError: + logger.debug("Alert rules were found but no usable labels were present") + continue + + logger.warning( + "No labeled alert rules were found, and no 'scrape_metadata' " + "was available. Using the alert group name as filename." 
+ ) + try: + for group in rules["groups"]: + return group["name"], None + except KeyError: + logger.debug("No group name was found to use as identifier") + + return None, None + + def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: + """Iterate through alert rules and inject topology into expressions. + + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + return rules + + modified_groups = [] + for group in rules["groups"]: + # Copy off rules, so we don't modify an object we're iterating over + rules_copy = group["rules"] + for idx, rule in enumerate(rules_copy): + labels = rule.get("labels") + + if labels: try: - labels = group["rules"][0]["labels"] - identifier = "{}_{}_{}".format( - labels["juju_model"], - labels["juju_model_uuid"], - labels["juju_application"], + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), ) - _, errmsg = self._tool.validate_alert_rules(alert_rules) - if errmsg: - errors.append(errmsg) - continue - - alerts[identifier] = alert_rules + # Inject topology and put it back in the list + rule["expr"] = self._tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), + topology.label_matcher_dict, + ) except KeyError: - logger.error("Alert rules were found but no usable labels were present") - if errors: - relation.data[self._charm.app]["event"] = json.dumps({"errors": "; ".join(errors)}) + # Some required JujuTopology key is missing. Just move on. 
+ pass - return alerts + group["rules"][idx] = rule + + modified_groups.append(group) + + rules["groups"] = modified_groups + return rules class ConsumerBase(Object): @@ -1370,6 +1462,7 @@ def __init__( relation_name: str = DEFAULT_RELATION_NAME, alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, recursive: bool = False, + skip_alert_topology_labeling: bool = False, ): super().__init__(charm, relation_name) self._charm = charm @@ -1385,6 +1478,7 @@ def __init__( e.message, ) self._alert_rules_path = alert_rules_path + self._skip_alert_topology_labeling = skip_alert_topology_labeling self._recursive = recursive @@ -1392,7 +1486,9 @@ def _handle_alert_rules(self, relation): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules(self.topology) + alert_rules = ( + AlertRules(None) if self._skip_alert_topology_labeling else AlertRules(self.topology) + ) alert_rules.add_path(self._alert_rules_path, recursive=self._recursive) alert_rules_as_dict = alert_rules.as_dict() @@ -1432,7 +1528,7 @@ def loki_endpoints(self) -> List[dict]: class LokiPushApiConsumer(ConsumerBase): """Loki Consumer class.""" - on = LokiPushApiEvents() + on = LokiPushApiEvents() # pyright: ignore def __init__( self, @@ -1440,27 +1536,32 @@ def __init__( relation_name: str = DEFAULT_RELATION_NAME, alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, recursive: bool = True, + skip_alert_topology_labeling: bool = False, ): """Construct a Loki charm client. - The `LokiPushApiConsumer` object provides configurations to a Loki client charm. - A charm instantiating this object needs Loki information, for instance the - Loki API endpoint to push logs. - The `LokiPushApiConsumer` can be instantiated as follows: + The `LokiPushApiConsumer` object provides configurations to a Loki client charm, such as + the Loki API endpoint to push logs. 
It is intended for workloads that can speak + loki_push_api (https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki), such + as grafana-agent. + (If you need to forward workload stdout logs, then use v1/loki_push_api.LogForwarder; if + you need to forward log files, then use LogProxyConsumer.) + + `LokiPushApiConsumer` can be instantiated as follows: self._loki_consumer = LokiPushApiConsumer(self) Args: charm: a `CharmBase` object that manages this `LokiPushApiConsumer` object. - Typically this is `self` in the instantiating class. + Typically, this is `self` in the instantiating class. relation_name: the string name of the relation interface to look up. If `charm` has exactly one relation with this interface, the relation's name is returned. If none or multiple relations with the provided interface - are found, this method will raise either an exception of type - NoRelationWithInterfaceFoundError or MultipleRelationsWithInterfaceFoundError, - respectively. + are found, this method will raise either a NoRelationWithInterfaceFoundError or + MultipleRelationsWithInterfaceFoundError exception, respectively. alert_rules_path: a string indicating a path where alert rules can be found - recursive: Whether or not to scan for rule files recursively. + recursive: Whether to scan for rule files recursively. + skip_alert_topology_labeling: whether to skip the alert topology labeling. 
Raises: RelationNotFoundError: If there is no relation in the charm's metadata.yaml @@ -1485,7 +1586,9 @@ def __init__( _validate_relation_by_interface_and_direction( charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires ) - super().__init__(charm, relation_name, alert_rules_path, recursive) + super().__init__( + charm, relation_name, alert_rules_path, recursive, skip_alert_topology_labeling + ) events = self._charm.on[relation_name] self.framework.observe(self._charm.on.upgrade_charm, self._on_lifecycle_event) self.framework.observe(events.relation_joined, self._on_logging_relation_joined) @@ -1631,6 +1734,9 @@ class LogProxyEvents(ObjectEvents): class LogProxyConsumer(ConsumerBase): """LogProxyConsumer class. + > Note: This object is deprecated. Consider migrating to v1/loki_push_api.LogForwarder with the + > release of Juju 3.6 LTS. + The `LogProxyConsumer` object provides a method for attaching `promtail` to a workload in order to generate structured logging data from applications which traditionally log to syslog or do not have native Loki integration. @@ -1645,9 +1751,8 @@ class LogProxyConsumer(ConsumerBase): relation_name: the string name of the relation interface to look up. If `charm` has exactly one relation with this interface, the relation's name is returned. If none or multiple relations with the provided interface - are found, this method will raise either an exception of type - NoRelationWithInterfaceFoundError or MultipleRelationsWithInterfaceFoundError, - respectively. + are found, this method will raise either a NoRelationWithInterfaceFoundError or + MultipleRelationsWithInterfaceFoundError exception, respectively. enable_syslog: Whether to enable syslog integration. syslog_port: The port syslog is attached to. alert_rules_path: an optional path for the location of alert rules @@ -1670,12 +1775,12 @@ class LogProxyConsumer(ConsumerBase): role. 
""" - on = LogProxyEvents() + on = LogProxyEvents() # pyright: ignore def __init__( self, charm, - log_files: Optional[list] = None, + log_files: Optional[Union[List[str], str]] = None, relation_name: str = DEFAULT_LOG_PROXY_RELATION_NAME, enable_syslog: bool = False, syslog_port: int = 1514, @@ -1683,19 +1788,30 @@ def __init__( recursive: bool = False, container_name: str = "", promtail_resource_name: Optional[str] = None, + *, # TODO: In v1, move the star up so everything after 'charm' is a kwarg + insecure_skip_verify: bool = False, ): super().__init__(charm, relation_name, alert_rules_path, recursive) self._charm = charm self._relation_name = relation_name self._container = self._get_container(container_name) self._container_name = self._get_container_name(container_name) - self._log_files = log_files or [] + + if not log_files: + log_files = [] + elif isinstance(log_files, str): + log_files = [log_files] + elif not isinstance(log_files, list) or not all((isinstance(x, str) for x in log_files)): + raise TypeError("The 'log_files' argument must be a list of strings.") + self._log_files = log_files + self._syslog_port = syslog_port self._is_syslog = enable_syslog self.topology = JujuTopology.from_charm(charm) self._promtail_resource_name = promtail_resource_name or "promtail-bin" + self.insecure_skip_verify = insecure_skip_verify - # architechure used for promtail binary + # architecture used for promtail binary arch = platform.processor() self._arch = "amd64" if arch == "x86_64" else arch @@ -1784,7 +1900,7 @@ def _on_relation_departed(self, _: RelationEvent) -> None: self._container.stop(WORKLOAD_SERVICE_NAME) self.on.log_proxy_endpoint_departed.emit() - def _get_container(self, container_name: str = "") -> Container: + def _get_container(self, container_name: str = "") -> Container: # pyright: ignore """Gets a single container by name or using the only container running in the Pod. 
If there is more than one container in the Pod a `PromtailDigestError` is emitted. @@ -1858,7 +1974,9 @@ def _add_pebble_layer(self, workload_binary_path: str) -> None: } }, } - self._container.add_layer(self._container_name, pebble_layer, combine=True) + self._container.add_layer( + self._container_name, pebble_layer, combine=True # pyright: ignore + ) def _create_directories(self) -> None: """Creates the directories for Promtail binary and config file.""" @@ -1894,7 +2012,13 @@ def _push_binary_to_workload(self, binary_path: str, workload_binary_path: str) workload_binary_path: path in workload container to which promtail binary is pushed. """ with open(binary_path, "rb") as f: - self._container.push(workload_binary_path, f, permissions=0o755, make_dirs=True) + self._container.push( + workload_binary_path, + f, + permissions=0o755, + encoding=None, # pyright: ignore + make_dirs=True, + ) logger.debug("The promtail binary file has been pushed to the workload container.") @property @@ -1912,8 +2036,7 @@ def _promtail_attached_as_resource(self) -> bool: except NameError as e: if "invalid resource name" in str(e): return False - else: - raise + raise def _push_promtail_if_attached(self, workload_binary_path: str) -> bool: """Checks whether Promtail binary is attached to the charm or not. @@ -1954,7 +2077,7 @@ def _promtail_must_be_downloaded(self, promtail_info: dict) -> bool: return False def _sha256sums_matches(self, file_path: str, sha256sum: str) -> bool: - """Checks whether a file's sha256sum matches or not with an specific sha256sum. + """Checks whether a file's sha256sum matches or not with a specific sha256sum. Args: file_path: A string representing the files' patch. @@ -1962,7 +2085,7 @@ def _sha256sums_matches(self, file_path: str, sha256sum: str) -> bool: Returns: a boolean representing whether a file's sha256sum matches or not with - an specific sha256sum. + a specific sha256sum. 
""" try: with open(file_path, "rb") as f: @@ -2003,7 +2126,24 @@ def _download_and_push_promtail_to_workload(self, promtail_info: dict) -> None: - "zipsha": sha256 sum of zip file of promtail binary - "binsha": sha256 sum of unpacked promtail binary """ - with request.urlopen(promtail_info["url"]) as r: + # Check for Juju proxy variables and fall back to standard ones if not set + # If no Juju proxy variable was set, we set proxies to None to let the ProxyHandler get + # the proxy env variables from the environment + proxies = { + # The ProxyHandler uses only the protocol names as keys + # https://docs.python.org/3/library/urllib.request.html#urllib.request.ProxyHandler + "https": os.environ.get("JUJU_CHARM_HTTPS_PROXY", ""), + "http": os.environ.get("JUJU_CHARM_HTTP_PROXY", ""), + # The ProxyHandler uses `no` for the no_proxy key + # https://github.com/python/cpython/blob/3.12/Lib/urllib/request.py#L2553 + "no": os.environ.get("JUJU_CHARM_NO_PROXY", ""), + } + proxies = {k: v for k, v in proxies.items() if v != ""} or None + + proxy_handler = request.ProxyHandler(proxies) + opener = request.build_opener(proxy_handler) + + with opener.open(promtail_info["url"]) as r: file_bytes = r.read() file_path = os.path.join(BINARY_DIR, promtail_info["filename"] + ".gz") with open(file_path, "wb") as f: @@ -2054,8 +2194,15 @@ def _current_config(self) -> dict: @property def _promtail_config(self) -> dict: - """Generates the config file for Promtail.""" + """Generates the config file for Promtail. 
+ + Reference: https://grafana.com/docs/loki/latest/send-data/promtail/configuration + """ config = {"clients": self._clients_list()} + if self.insecure_skip_verify: + for client in config["clients"]: + client["tls_config"] = {"insecure_skip_verify": True} + config.update(self._server_config()) config.update(self._positions()) config.update(self._scrape_configs()) @@ -2310,11 +2457,9 @@ def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: # expr: up transformed_rules = {"groups": []} # type: ignore for rule in rules["groups"]: - transformed = {"name": str(uuid.uuid4()), "rules": [rule]} - transformed_rules["groups"].append(transformed) + transformed_rules["groups"].append(rule) rule_path.write_text(yaml.dump(transformed_rules)) - args = [str(self.path), "--format", "logql", "validate", str(rule_path)] # noinspection PyBroadException try: diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py index ef4ec58..a79e5d4 100644 --- a/lib/charms/observability_libs/v0/juju_topology.py +++ b/lib/charms/observability_libs/v0/juju_topology.py @@ -75,7 +75,7 @@ LIBID = "bced1658f20f49d28b88f61f83c2d232" LIBAPI = 0 -LIBPATCH = 3 +LIBPATCH = 6 class InvalidUUIDError(Exception): @@ -87,15 +87,19 @@ def __init__(self, uuid: str): class JujuTopology: - """JujuTopology is used for storing, generating and formatting juju topology information.""" + """JujuTopology is used for storing, generating and formatting juju topology information. + + DEPRECATED: This class is deprecated. Use `pip install cosl` and + `from cosl.juju_topology import JujuTopology` instead. + """ def __init__( self, model: str, model_uuid: str, application: str, - unit: str = None, - charm_name: str = None, + unit: Optional[str] = None, + charm_name: Optional[str] = None, ): """Build a JujuTopology object. 
@@ -181,7 +185,10 @@ def from_dict(cls, data: dict): ) def as_dict( - self, *, remapped_keys: Dict[str, str] = None, excluded_keys: List[str] = None + self, + *, + remapped_keys: Optional[Dict[str, str]] = None, + excluded_keys: Optional[List[str]] = None, ) -> OrderedDict: """Format the topology information into an ordered dict. diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 70b7f1e..e3d35c6 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -1,6 +1,6 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. -"""Source code can be found on GitHub at canonical/observability-libs/lib/charms/observability_libs. +"""Prometheus Scrape Library. ## Overview @@ -13,12 +13,10 @@ shared between Prometheus charms and any other charm that intends to provide a scrape target for Prometheus. -## Dependencies +## Source code -Using this library requires you to fetch the juju_topology library from -[observability-libs](https://charmhub.io/observability-libs/libraries/juju_topology). 
- -`charmcraft fetch-lib charms.observability_libs.v0.juju_topology` +Source code can be found on GitHub at: + https://github.com/canonical/prometheus-k8s-operator/tree/main/lib/charms/prometheus_k8s ## Provider Library Usage @@ -116,7 +114,7 @@ def __init__(self, *args): { "targets": ["10.1.32.215:7000", "*:8000"], "labels": { - "some-key": "some-value" + "some_key": "some-value" } } ] @@ -146,7 +144,7 @@ def __init__(self, *args): { "targets": ["*:7000"], "labels": { - "some-key": "some-value" + "some_key": "some-value" } } ] @@ -158,7 +156,7 @@ def __init__(self, *args): { "targets": ["*:8000"], "labels": { - "some-other-key": "some-other-value" + "some_other_key": "some-other-value" } } ] @@ -180,7 +178,7 @@ def __init__(self, *args): - `scrape_timeout` - `proxy_url` - `relabel_configs` -- `metrics_relabel_configs` +- `metric_relabel_configs` - `sample_limit` - `label_limit` - `label_name_length_limit` @@ -255,7 +253,11 @@ def _on_scrape_targets_changed(self, event): - a single rule format, which is a simplified subset of the official format, comprising a single alert rule per file, using the same YAML fields. -The file name must have the `.rule` extension. +The file name must have one of the following extensions: +- `.rule` +- `.rules` +- `.yml` +- `.yaml` An example of the contents of such a file in the custom single rule format is shown below. 
@@ -333,13 +335,23 @@ def _on_scrape_targets_changed(self, event): import tempfile from collections import defaultdict from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import yaml -from charms.observability_libs.v0.juju_topology import JujuTopology +from cosl import JujuTopology +from cosl.rules import AlertRules from ops.charm import CharmBase, RelationRole -from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents +from ops.framework import ( + BoundEvent, + EventBase, + EventSource, + Object, + ObjectEvents, + StoredDict, + StoredList, + StoredState, +) from ops.model import Relation # The unique Charmhub library identifier, never change it @@ -350,7 +362,9 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 24 +LIBPATCH = 47 + +PYDEPS = ["cosl"] logger = logging.getLogger(__name__) @@ -363,7 +377,7 @@ def _on_scrape_targets_changed(self, event): "scrape_timeout", "proxy_url", "relabel_configs", - "metrics_relabel_configs", + "metric_relabel_configs", "sample_limit", "label_limit", "label_name_length_limit", @@ -371,6 +385,8 @@ def _on_scrape_targets_changed(self, event): "scheme", "basic_auth", "tls_config", + "authorization", + "params", } DEFAULT_JOB = { "metrics_path": "/metrics", @@ -449,7 +465,7 @@ def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: def expand_wildcard_targets_into_individual_jobs( scrape_jobs: List[dict], hosts: Dict[str, Tuple[str, str]], - topology: JujuTopology = None, + topology: Optional[JujuTopology] = None, ) -> List[dict]: """Extract wildcard hosts from the given scrape_configs list into separate jobs. @@ -505,8 +521,8 @@ def expand_wildcard_targets_into_individual_jobs( # for such a target. 
Therefore labeling with Juju topology, excluding the # unit name. non_wildcard_static_config["labels"] = { - **non_wildcard_static_config.get("labels", {}), **topology.label_matcher_dict, + **non_wildcard_static_config.get("labels", {}), } non_wildcard_static_configs.append(non_wildcard_static_config) @@ -531,9 +547,9 @@ def expand_wildcard_targets_into_individual_jobs( if topology: # Add topology labels modified_static_config["labels"] = { - **modified_static_config.get("labels", {}), **topology.label_matcher_dict, **{"juju_unit": unit_name}, + **modified_static_config.get("labels", {}), } # Instance relabeling for topology should be last in order. @@ -581,15 +597,22 @@ def render_alertmanager_static_configs(alertmanagers: List[str]): # Create a mapping from paths to netlocs # Group alertmanager targets into a dictionary of lists: # {path: [netloc1, netloc2]} - paths = defaultdict(list) # type: Dict[str, List[str]] + paths = defaultdict(list) # type: Dict[Tuple[str, str], List[str]] for parsed in map(urlparse, sanitized): path = parsed.path or "/" - paths[path].append(parsed.netloc) + paths[(parsed.scheme, path)].append(parsed.netloc) return { "alertmanagers": [ - {"path_prefix": path_prefix, "static_configs": [{"targets": netlocs}]} - for path_prefix, netlocs in paths.items() + { + # For https we still do not render a `tls_config` section because + # certs are expected to be made available by the charm via the + # `update-ca-certificates` mechanism. 
+ "scheme": scheme, + "path_prefix": path_prefix, + "static_configs": [{"targets": netlocs}], + } + for (scheme, path_prefix), netlocs in paths.items() ] } @@ -668,10 +691,39 @@ def restore(self, snapshot): self.errors = snapshot["errors"] +class InvalidScrapeJobEvent(EventBase): + """Event emitted when alert rule files are not valid.""" + + def __init__(self, handle, errors: str = ""): + super().__init__(handle) + self.errors = errors + + def snapshot(self) -> Dict: + """Save error information.""" + return {"errors": self.errors} + + def restore(self, snapshot): + """Restore error information.""" + self.errors = snapshot["errors"] + + class MetricsEndpointProviderEvents(ObjectEvents): """Events raised by :class:`InvalidAlertRuleEvent`s.""" alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) + invalid_scrape_job = EventSource(InvalidScrapeJobEvent) + + +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + if isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + return obj def _validate_relation_by_interface_and_direction( @@ -713,7 +765,7 @@ def _validate_relation_by_interface_and_direction( actual_relation_interface = relation.interface_name if actual_relation_interface != expected_relation_interface: raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface + relation_name, expected_relation_interface, actual_relation_interface or "None" ) if expected_relation_role == RelationRole.provides: @@ -781,204 +833,6 @@ def _is_single_alert_rule_format(rules_dict: dict) -> bool: return set(rules_dict) >= {"alert", "expr"} -class AlertRules: - """Utility class for amalgamating prometheus alert rule files and injecting juju topology. 
- - An `AlertRules` object supports aggregating alert rules from files and directories in both - official and single rule file formats using the `add_path()` method. All the alert rules - read are annotated with Juju topology labels and amalgamated into a single data structure - in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be - easily dumped into JSON format and exchanged over relation data. The dictionary can also - be dumped into YAML format and written directly into an alert rules file that is read by - Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, - since Prometheus allows only a single list of alert rule groups per alert rules file. - - The official Prometheus format is a YAML file conforming to the Prometheus documentation - (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). - The custom single rule format is a subsection of the official YAML, having a single alert - rule, effectively "one alert per file". - """ - - # This class uses the following terminology for the various parts of a rule file: - # - alert rules file: the entire groups[] yaml, including the "groups:" key. - # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list - # of dictionaries that have the "name" and "rules" keys. - # - alert group (singular): a single dictionary that has the "name" and "rules" keys. - # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with - # the "alert" and "expr" keys. - # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. - - def __init__(self, topology: Optional[JujuTopology] = None): - """Build and alert rule object. - - Args: - topology: an optional `JujuTopology` instance that is used to annotate all alert rules. 
- """ - self.topology = topology - self.tool = CosTool(None) - self.alert_groups = [] # type: List[dict] - - def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: - """Read a rules file from path, injecting juju topology. - - Args: - root_path: full path to the root rules folder (used only for generating group name) - file_path: full path to a *.rule file. - - Returns: - A list of dictionaries representing the rules file, if file is valid (the structure is - formed by `yaml.safe_load` of the file); an empty list otherwise. - """ - with file_path.open() as rf: - # Load a list of rules from file then add labels and filters - try: - rule_file = yaml.safe_load(rf) - - except Exception as e: - logger.error("Failed to read alert rules from %s: %s", file_path.name, e) - return [] - - if not rule_file: - logger.warning("Empty rules file: %s", file_path.name) - return [] - if not isinstance(rule_file, dict): - logger.error("Invalid rules file (must be a dict): %s", file_path.name) - return [] - if _is_official_alert_rule_format(rule_file): - alert_groups = rule_file["groups"] - elif _is_single_alert_rule_format(rule_file): - # convert to list of alert groups - # group name is made up from the file name - alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] - else: - # invalid/unsupported - logger.error("Invalid rules file: %s", file_path.name) - return [] - - # update rules with additional metadata - for alert_group in alert_groups: - # update group name with topology and sub-path - alert_group["name"] = self._group_name( - str(root_path), - str(file_path), - alert_group["name"], - ) - - # add "juju_" topology labels - for alert_rule in alert_group["rules"]: - if "labels" not in alert_rule: - alert_rule["labels"] = {} - - if self.topology: - alert_rule["labels"].update(self.topology.label_matcher_dict) - # insert juju topology filters into a prometheus alert rule - alert_rule["expr"] = self.tool.inject_label_matchers( - 
re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]), - self.topology.label_matcher_dict, - ) - - return alert_groups - - def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: - """Generate group name from path and topology. - - The group name is made up of the relative path between the root dir_path, the file path, - and topology identifier. - - Args: - root_path: path to the root rules dir. - file_path: path to rule file. - group_name: original group name to keep as part of the new augmented group name - - Returns: - New group name, augmented by juju topology and relative path. - """ - rel_path = os.path.relpath(os.path.dirname(file_path), root_path) - rel_path = "" if rel_path == "." else rel_path.replace(os.path.sep, "_") - - # Generate group name: - # - name, from juju topology - # - suffix, from the relative path of the rule file; - group_name_parts = [self.topology.identifier] if self.topology else [] - group_name_parts.extend([rel_path, group_name, "alerts"]) - # filter to remove empty strings - return "_".join(filter(None, group_name_parts)) - - @classmethod - def _multi_suffix_glob( - cls, dir_path: Path, suffixes: List[str], recursive: bool = True - ) -> list: - """Helper function for getting all files in a directory that have a matching suffix. - - Args: - dir_path: path to the directory to glob from. - suffixes: list of suffixes to include in the glob (items should begin with a period). - recursive: a flag indicating whether a glob is recursive (nested) or not. - - Returns: - List of files in `dir_path` that have one of the suffixes specified in `suffixes`. - """ - all_files_in_dir = dir_path.glob("**/*" if recursive else "*") - return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir)) - - def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: - """Read all rule files in a directory. - - All rules from files for the same directory are loaded into a single - group. 
The generated name of this group includes juju topology. - By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. - - Args: - dir_path: directory containing *.rule files (alert rules without groups). - recursive: flag indicating whether to scan for rule files recursively. - - Returns: - a list of dictionaries representing prometheus alert rule groups, each dictionary - representing an alert group (structure determined by `yaml.safe_load`). - """ - alert_groups = [] # type: List[dict] - - # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob(dir_path, [".rule", ".rules"], recursive): - alert_groups_from_file = self._from_file(dir_path, file_path) - if alert_groups_from_file: - logger.debug("Reading alert rule from %s", file_path) - alert_groups.extend(alert_groups_from_file) - - return alert_groups - - def add_path(self, path: str, *, recursive: bool = False) -> None: - """Add rules from a dir path. - - All rules from files are aggregated into a data structure representing a single rule file. - All group names are augmented with juju topology. - - Args: - path: either a rules file or a dir of rules files. - recursive: whether to read files recursively or not (no impact if `path` is a file). - - Returns: - True if path was added else False. - """ - path = Path(path) # type: Path - if path.is_dir(): - self.alert_groups.extend(self._from_dir(path, recursive)) - elif path.is_file(): - self.alert_groups.extend(self._from_file(path.parent, path)) - else: - logger.debug("Alert rules path does not exist: %s", path) - - def as_dict(self) -> dict: - """Return standard alert rules file in dict representation. - - Returns: - a dictionary containing a single list of alert rule groups. - The list of alert rule groups is provided as value of the - "groups" dictionary key. 
- """ - return {"groups": self.alert_groups} if self.alert_groups else {} - - class TargetsChangedEvent(EventBase): """Event emitted when Prometheus scrape targets change.""" @@ -1004,7 +858,7 @@ class MonitoringEvents(ObjectEvents): class MetricsEndpointConsumer(Object): """A Prometheus based Monitoring service.""" - on = MonitoringEvents() + on = MonitoringEvents() # pyright: ignore def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): """A Prometheus based Monitoring service. @@ -1086,12 +940,24 @@ def jobs(self) -> list: for relation in self._charm.model.relations[self._relation_name]: static_scrape_jobs = self._static_scrape_config(relation) if static_scrape_jobs: - scrape_jobs.extend(static_scrape_jobs) + # Duplicate job names will cause validate_scrape_jobs to fail. + # Therefore we need to dedupe here and after all jobs are collected. + static_scrape_jobs = _dedupe_job_names(static_scrape_jobs) + try: + self._tool.validate_scrape_jobs(static_scrape_jobs) + except subprocess.CalledProcessError as e: + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["scrape_job_errors"] = str(e) + relation.data[self._charm.app]["event"] = json.dumps(data) + else: + scrape_jobs.extend(static_scrape_jobs) scrape_jobs = _dedupe_job_names(scrape_jobs) return scrape_jobs + @property def alerts(self) -> dict: """Fetch alerts for all relations. 
@@ -1142,37 +1008,48 @@ def alerts(self): if not alert_rules: continue -            try: -                scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) -                identifier = JujuTopology.from_dict(scrape_metadata).identifier -                alerts[identifier] = self._tool.apply_label_matchers(alert_rules) - -            except KeyError as e: -                logger.debug( -                    "Relation %s has no 'scrape_metadata': %s", -                    relation.id, -                    e, -                ) -                identifier = self._get_identifier_by_alert_rules(alert_rules) +            alert_rules = self._inject_alert_expr_labels(alert_rules) + +            identifier, topology = self._get_identifier_by_alert_rules(alert_rules) +            if not topology: +                try: +                    scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) +                    identifier = JujuTopology.from_dict(scrape_metadata).identifier + +                except KeyError as e: +                    logger.debug( +                        "Relation %s has no 'scrape_metadata': %s", +                        relation.id, +                        e, +                    ) if not identifier: logger.error( -                    "Alert rules were found but no usable group or identifier was present" +                    "Alert rules were found but no usable group or identifier was present." ) continue + +            # We need to append the relation info to the identifier. This is to allow for cases where there are two +            # relations which eventually scrape the same application. Issue #551. 
+ identifier = f"{identifier}_{relation.name}_{relation.id}" + alerts[identifier] = alert_rules _, errmsg = self._tool.validate_alert_rules(alert_rules) if errmsg: if alerts[identifier]: del alerts[identifier] - relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg}) + if self._charm.unit.is_leader(): + data = json.loads(relation.data[self._charm.app].get("event", "{}")) + data["errors"] = errmsg + relation.data[self._charm.app]["event"] = json.dumps(data) continue return alerts - def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: + def _get_identifier_by_alert_rules( + self, rules: dict + ) -> Tuple[Union[str, None], Union[JujuTopology, None]]: """Determine an appropriate dict key for alert rules. The key is used as the filename when writing alerts to disk, so the structure @@ -1180,21 +1057,28 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: Args: rules: a dict of alert rules + Returns: + A tuple containing an identifier, if found, and a JujuTopology, if it could + be constructed. """ if "groups" not in rules: logger.debug("No alert groups were found in relation data") - return None + return None, None # Construct an ID based on what's in the alert rules if they have labels for group in rules["groups"]: try: labels = group["rules"][0]["labels"] - identifier = "{}_{}_{}".format( - labels["juju_model"], - labels["juju_model_uuid"], - labels["juju_application"], + topology = JujuTopology( + # Don't try to safely get required constructor fields. 
There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), ) - return identifier + return topology.identifier, topology except KeyError: logger.debug("Alert rules were found but no usable labels were present") continue @@ -1205,11 +1089,55 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: ) try: for group in rules["groups"]: - return group["name"] + return group["name"], None except KeyError: logger.debug("No group name was found to use as identifier") - return None + return None, None + + def _inject_alert_expr_labels(self, rules: Dict[str, Any]) -> Dict[str, Any]: + """Iterate through alert rules and inject topology into expressions. + + Args: + rules: a dict of alert rules + """ + if "groups" not in rules: + return rules + + modified_groups = [] + for group in rules["groups"]: + # Copy off rules, so we don't modify an object we're iterating over + rules_copy = group["rules"] + for idx, rule in enumerate(rules_copy): + labels = rule.get("labels") + + if labels: + try: + topology = JujuTopology( + # Don't try to safely get required constructor fields. There's already + # a handler for KeyErrors + model_uuid=labels["juju_model_uuid"], + model=labels["juju_model"], + application=labels["juju_application"], + unit=labels.get("juju_unit", ""), + charm_name=labels.get("juju_charm", ""), + ) + + # Inject topology and put it back in the list + rule["expr"] = self._tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), + topology.alert_expression_dict, + ) + except KeyError: + # Some required JujuTopology key is missing. Just move on. 
+ pass + + group["rules"][idx] = rule + + modified_groups.append(group) + + rules["groups"] = modified_groups + return rules def _static_scrape_config(self, relation) -> list: """Generate the static scrape configuration for a single relation. @@ -1230,29 +1158,31 @@ def _static_scrape_config(self, relation) -> list: if not relation.units: return [] - scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) + scrape_configs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) - if not scrape_jobs: + if not scrape_configs: return [] scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) if not scrape_metadata: - return scrape_jobs + return scrape_configs topology = JujuTopology.from_dict(scrape_metadata) job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) - scrape_jobs = PrometheusConfig.prefix_job_names(scrape_jobs, job_name_prefix) - scrape_jobs = PrometheusConfig.sanitize_scrape_configs(scrape_jobs) + scrape_configs = PrometheusConfig.prefix_job_names(scrape_configs, job_name_prefix) + scrape_configs = PrometheusConfig.sanitize_scrape_configs(scrape_configs) hosts = self._relation_hosts(relation) - scrape_jobs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( - scrape_jobs, hosts, topology + scrape_configs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( + scrape_configs, hosts, topology ) - return scrape_jobs + # For https scrape targets we still do not render a `tls_config` section because certs + # are expected to be made available by the charm via the `update-ca-certificates` mechanism. 
+ return scrape_configs def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" @@ -1317,7 +1247,7 @@ def _dedupe_job_names(jobs: List[dict]): job["job_name"] = "{}_{}".format(job["job_name"], hashed) new_jobs = [] for key in jobs_dict: - new_jobs.extend([i for i in jobs_dict[key]]) + new_jobs.extend(list(jobs_dict[key])) # Deduplicate jobs which are equal # Again this in O(n^2) but it should be okay @@ -1368,7 +1298,7 @@ def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> st class MetricsEndpointProvider(Object): """A metrics endpoint for Prometheus.""" - on = MetricsEndpointProviderEvents() + on = MetricsEndpointProviderEvents() # pyright: ignore def __init__( self, @@ -1377,7 +1307,8 @@ def __init__( jobs=None, alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, - external_url: str = None, + external_url: str = "", + lookaside_jobs_callable: Optional[Callable] = None, ): """Construct a metrics provider for a Prometheus charm. @@ -1484,6 +1415,11 @@ def __init__( will be observed to re-set scrape job data (IP address and others) external_url: an optional argument that represents an external url that can be generated by an Ingress or a Proxy. + lookaside_jobs_callable: an optional `Callable` which should be invoked + when the job configuration is built as a secondary mapping. The callable + should return a `List[Dict]` which is syntactically identical to the + `jobs` parameter, but can be updated out of step initialization of + this library without disrupting the 'global' job spec. 
Raises: RelationNotFoundError: If there is no relation in the charm's metadata.yaml @@ -1523,13 +1459,14 @@ def __init__( external_url if urlparse(external_url).scheme else ("http://" + external_url) ) self.external_url = external_url + self._lookaside_jobs = lookaside_jobs_callable events = self._charm.on[self._relation_name] self.framework.observe(events.relation_changed, self._on_relation_changed) if not refresh_event: # FIXME remove once podspec charms are verified. - # `self._set_scrape_job_spec()` is called every re-init so this should not be needed. + # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. if len(self._charm.meta.containers) == 1: if "kubernetes" in self._charm.meta.series: # This is a podspec charm @@ -1551,19 +1488,9 @@ def __init__( if not isinstance(refresh_event, list): refresh_event = [refresh_event] + self.framework.observe(events.relation_joined, self.set_scrape_job_spec) for ev in refresh_event: - self.framework.observe(ev, self._set_scrape_job_spec) - - # Update relation data every reinit. If instead we used event hooks then observing only - # relation-joined would not be sufficient: - # - Would need to observe leader-elected, in case there was no leader during - # relation-joined. - # - If later related to an ingress provider, then would need to register and wait for - # update-status interval to elapse before changes would apply. - # - The ingerss-ready custom event is currently emitted prematurely and cannot be relied - # upon: https://github.com/canonical/traefik-k8s-operator/issues/78 - # NOTE We may still end up waiting for update-status before changes are applied. 
- self._set_scrape_job_spec() + self.framework.observe(ev, self.set_scrape_job_spec) def _on_relation_changed(self, event): """Check for alert rule messages in the relation data before moving on.""" @@ -1579,7 +1506,16 @@ def _on_relation_changed(self, event): else: self.on.alert_rule_status_changed.emit(valid=valid, errors=errors) - def _set_scrape_job_spec(self, _=None): + scrape_errors = ev.get("scrape_job_errors", None) + if scrape_errors: + self.on.invalid_scrape_job.emit(errors=scrape_errors) + + def update_scrape_job_spec(self, jobs): + """Update scrape job specification.""" + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + self.set_scrape_job_spec() + + def set_scrape_job_spec(self, _=None): """Ensure scrape target information is made available to prometheus. When a metrics provider charm is related to a prometheus charm, the @@ -1593,7 +1529,7 @@ def _set_scrape_job_spec(self, _=None): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules(topology=self.topology) + alert_rules = AlertRules(query_type="promql", topology=self.topology) alert_rules.add_path(self._alert_rules_path, recursive=True) alert_rules_as_dict = alert_rules.as_dict() @@ -1601,12 +1537,11 @@ def _set_scrape_job_spec(self, _=None): relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) - if alert_rules_as_dict: - # Update relation data with the string representation of the rule file. - # Juju topology is already included in the "scrape_metadata" field above. - # The consumer side of the relation uses this information to name the rules file - # that is written to the filesystem. - relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. 
+ # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) def _set_unit_ip(self, _=None): """Set unit host address. @@ -1664,7 +1599,10 @@ def _scrape_jobs(self) -> list: A list of dictionaries, where each dictionary specifies a single scrape job for Prometheus. """ - return self._jobs if self._jobs else [DEFAULT_JOB] + jobs = self._jobs or [] + if callable(self._lookaside_jobs): + jobs.extend(PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())) + return jobs or [DEFAULT_JOB] @property def _scrape_metadata(self) -> dict: @@ -1737,7 +1675,7 @@ def _update_relation_data(self, _): if not self._charm.unit.is_leader(): return - alert_rules = AlertRules() + alert_rules = AlertRules(query_type="promql") alert_rules.add_path(self.dir_path, recursive=self._recursive) alert_rules_as_dict = alert_rules.as_dict() @@ -1796,7 +1734,6 @@ class MetricsEndpointAggregator(Object): derive from `MetricsEndpointAggregator` overriding the `_get_targets()` method, which is responsible for aggregating the unit name, host address ("hostname") and port of the scrape target. - `MetricsEndpointAggregator` also assumes that each unit of a scrape target sets in its unit-level relation data a key named "groups". The value of this key is expected to be the string @@ -1831,7 +1768,15 @@ class MetricsEndpointAggregator(Object): constructing an aggregator object. """ - def __init__(self, charm, relation_names, relabel_instance=True): + _stored = StoredState() + + def __init__( + self, + charm, + relation_names: Optional[dict] = None, + relabel_instance=True, + resolve_addresses=False, + ): """Construct a `MetricsEndpointAggregator`. Args: @@ -1847,14 +1792,25 @@ def __init__(self, charm, relation_names, relabel_instance=True): the Prometheus charm. 
relabel_instance: A boolean flag indicating if Prometheus scrape job "instance" labels must refer to Juju Topology. +            resolve_addresses: A boolean flag indicating if the aggregator +                should attempt to perform DNS lookups of targets and append +                a `dns_name` label """ -        super().__init__(charm, relation_names["prometheus"]) -        self._charm = charm -        self._target_relation = relation_names["scrape_target"] -        self._prometheus_relation = relation_names["prometheus"] -        self._alert_rules_relation = relation_names["alert_rules"] + +        relation_names = relation_names or {} + +        self._prometheus_relation = relation_names.get( +            "prometheus", "downstream-prometheus-scrape" +        ) +        self._target_relation = relation_names.get("scrape_target", "prometheus-target") +        self._alert_rules_relation = relation_names.get("alert_rules", "prometheus-rules") + +        super().__init__(charm, self._prometheus_relation) +        self._stored.set_default(jobs=[], alert_rules=[]) + +        self._relabel_instance = relabel_instance +        self._resolve_addresses = resolve_addresses # manage Prometheus charm relation events prometheus_events = self._charm.on[self._prometheus_relation] @@ -1862,13 +1818,15 @@ def __init__(self, charm, relation_names, relabel_instance=True): # manage list of Prometheus scrape jobs from related scrape targets target_events = self._charm.on[self._target_relation] -        self.framework.observe(target_events.relation_changed, self._update_prometheus_jobs) -        self.framework.observe(target_events.relation_departed, self._remove_prometheus_jobs) +        self.framework.observe(target_events.relation_changed, self._on_prometheus_targets_changed) +        self.framework.observe( +            target_events.relation_departed, self._on_prometheus_targets_departed +        ) # manage alert rules for Prometheus from related scrape targets alert_rule_events = self._charm.on[self._alert_rules_relation] -        self.framework.observe(alert_rule_events.relation_changed, self._update_alert_rules) -        self.framework.observe(alert_rule_events.relation_departed, 
self._remove_alert_rules) + self.framework.observe(alert_rule_events.relation_changed, self._on_alert_rules_changed) + self.framework.observe(alert_rule_events.relation_departed, self._on_alert_rules_departed) def _set_prometheus_data(self, event): """Ensure every new Prometheus instances is updated. @@ -1877,59 +1835,63 @@ def _set_prometheus_data(self, event): `MetricsEndpointAggregator`, that Prometheus unit is provided with the complete set of existing scrape jobs and alert rules. """ - jobs = [] # list of scrape jobs, one per relation + if not self._charm.unit.is_leader(): + return + + jobs = [] + _type_convert_stored( + self._stored.jobs # pyright: ignore + ) # list of scrape jobs, one per relation for relation in self.model.relations[self._target_relation]: targets = self._get_targets(relation) if targets and relation.app: jobs.append(self._static_scrape_job(targets, relation.app.name)) - groups = [] # list of alert rule groups, one group per relation + groups = [] + _type_convert_stored( + self._stored.alert_rules # pyright: ignore + ) # list of alert rule groups for relation in self.model.relations[self._alert_rules_relation]: unit_rules = self._get_alert_rules(relation) if unit_rules and relation.app: appname = relation.app.name rules = self._label_alert_rules(unit_rules, appname) - group = {"name": self._group_name(appname), "rules": rules} + group = {"name": self.group_name(appname), "rules": rules} groups.append(group) event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) - def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: + def _on_prometheus_targets_changed(self, event): """Update scrape jobs in response to scrape target changes. When there is any change in relation data with any scrape target, the Prometheus scrape job, for that specific target is - updated. 
Additionally, if this method is called manually, do the - same. - - Args: - targets: a `dict` containing target information - app_name: a `str` identifying the application + updated. """ - # new scrape job for the relation that has changed - updated_job = self._static_scrape_job(targets, app_name, **kwargs) + targets = self._get_targets(event.relation) + if not targets: + return - for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) - # list of scrape jobs that have not changed - jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] - jobs.append(updated_job) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) + # new scrape job for the relation that has changed + self.set_target_job_data(targets, event.relation.app.name) - def _update_prometheus_jobs(self, event): + def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: """Update scrape jobs in response to scrape target changes. When there is any change in relation data with any scrape target, the Prometheus scrape job, for that specific target is - updated. + updated. Additionally, if this method is called manually, do the + same. 
+ + Args: + targets: a `dict` containing target information + app_name: a `str` identifying the application + kwargs: a `dict` of the extra arguments passed to the function """ - targets = self._get_targets(event.relation) - if not targets: + if not self._charm.unit.is_leader(): return # new scrape job for the relation that has changed - updated_job = self._static_scrape_job(targets, event.relation.app.name) + updated_job = self._static_scrape_job(targets, app_name, **kwargs) for relation in self.model.relations[self._prometheus_relation]: jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) @@ -1938,7 +1900,10 @@ def _update_prometheus_jobs(self, event): jobs.append(updated_job) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - def _remove_prometheus_jobs(self, event): + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore + self._stored.jobs = jobs + + def _on_prometheus_targets_departed(self, event): """Remove scrape jobs when a target departs. Any time a scrape target departs, any Prometheus scrape job @@ -1946,6 +1911,20 @@ def _remove_prometheus_jobs(self, event): """ job_name = self._job_name(event.relation.app.name) unit_name = event.unit.name + self.remove_prometheus_jobs(job_name, unit_name) + + def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): + """Given a job name and unit name, remove scrape jobs associated. + + The `unit_name` parameter is used for automatic, relation data bag-based + generation, where the unit name in labels can be used to ensure that jobs with + similar names (which are generated via the app name when scanning relation data + bags) are not accidentally removed, as their unit name labels will differ. + For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is + sufficient to uniquely identify the target. 
+ """ + if not self._charm.unit.is_leader(): + return for relation in self.model.relations[self._prometheus_relation]: jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) @@ -1973,7 +1952,144 @@ def _remove_prometheus_jobs(self, event): relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - def _update_alert_rules(self, event): + if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore + self._stored.jobs = jobs + + def _job_name(self, appname) -> str: + """Construct a scrape job name. + + Each relation has its own unique scrape job name. All units in + the relation are scraped as part of the same scrape job. + + Args: + appname: string name of a related application. + + Returns: + a string Prometheus scrape job name for the application. + """ + return "juju_{}_{}_{}_prometheus_scrape".format( + self.model.name, self.model.uuid[:7], appname + ) + + def _get_targets(self, relation) -> dict: + """Fetch scrape targets for a relation. + + Scrape target information is returned for each unit in the + relation. This information contains the unit name, network + hostname (or address) for that unit, and port on which a + metrics endpoint is exposed in that unit. + + Args: + relation: an `ops.model.Relation` object for which scrape + targets are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. There values associated with each key is itself + a dictionary of the form + ``` + {"hostname": hostname, "port": port} + ``` + """ + targets = {} + for unit in relation.units: + port = relation.data[unit].get("port", 80) + hostname = relation.data[unit].get("hostname") + if hostname: + targets.update({unit.name: {"hostname": hostname, "port": port}}) + + return targets + + def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: + """Construct a static scrape job for an application. + + Args: + targets: a dictionary providing hostname and port for all + scrape target. 
The keys of this dictionary are unit + names. Values corresponding to these keys are + themselves a dictionary with keys "hostname" and + "port". + application_name: a string name of the application for + which this static scrape job is being constructed. + kwargs: a `dict` of the extra arguments passed to the function + + Returns: + A dictionary corresponding to a Prometheus static scrape + job configuration for one application. The returned + dictionary may be transformed into YAML and appended to + the list of any existing list of Prometheus static configs. + """ + juju_model = self.model.name + juju_model_uuid = self.model.uuid + + job = { + "job_name": self._job_name(application_name), + "static_configs": [ + { + "targets": ["{}:{}".format(target["hostname"], target["port"])], + "labels": { + "juju_model": juju_model, + "juju_model_uuid": juju_model_uuid, + "juju_application": application_name, + "juju_unit": unit_name, + "host": target["hostname"], + # Expanding this will merge the dicts and replace the + # topology labels if any were present/found + **self._static_config_extra_labels(target), + }, + } + for unit_name, target in targets.items() + ], + "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), + } + job.update(kwargs.get("updates", {})) + + return job + + def _static_config_extra_labels(self, target: Dict[str, str]) -> Dict[str, str]: + """Build a list of extra static config parameters, if specified.""" + extra_info = {} + + if self._resolve_addresses: + try: + dns_name = socket.gethostbyaddr(target["hostname"])[0] + except OSError: + logger.debug("Could not perform DNS lookup for %s", target["hostname"]) + dns_name = target["hostname"] + extra_info["dns_name"] = dns_name + + return extra_info + + @property + def _relabel_configs(self) -> list: + """Create Juju topology relabeling configuration. + + Using Juju topology for instance labels ensures that these + labels are stable across unit recreation. 
+ + Returns: + a list of Prometheus relabeling configurations. Each item in + this list is one relabel configuration. + """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + def _on_alert_rules_changed(self, event): """Update alert rules in response to scrape target changes. When there is any change in alert rule relation data for any @@ -1984,27 +2100,54 @@ def _update_alert_rules(self, event): if not unit_rules: return - appname = event.relation.app.name - rules = self._label_alert_rules(unit_rules, appname) - # the alert rule group that has changed - updated_group = {"name": self._group_name(appname), "rules": rules} + app_name = event.relation.app.name + self.set_alert_rule_data(app_name, unit_rules) + + def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = True) -> None: + """Update alert rule data. + + The unit rules should be a dict, which is has additional Juju topology labels added. For + rules generated by the NRPE exporter, they are pre-labeled so lookups can be performed. 
+ """ + if not self._charm.unit.is_leader(): + return + + if label_rules: + rules = self._label_alert_rules(unit_rules, name) + else: + rules = [unit_rules] + updated_group = {"name": self.group_name(name), "rules": rules} for relation in self.model.relations[self._prometheus_relation]: alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) groups = alert_rules.get("groups", []) # list of alert rule groups that have not changed - groups = [group for group in groups if updated_group["name"] != group["name"]] - groups.append(updated_group) + for group in groups: + if group["name"] == updated_group["name"]: + group["rules"] = [r for r in group["rules"] if r not in updated_group["rules"]] + group["rules"].extend(updated_group["rules"]) + + if updated_group["name"] not in [g["name"] for g in groups]: + groups.append(updated_group) relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) - def _remove_alert_rules(self, event): + if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore + self._stored.alert_rules = groups + + def _on_alert_rules_departed(self, event): """Remove alert rules for departed targets. Any time a scrape target departs any alert rules associated with that specific scrape target is removed. 
""" - group_name = self._group_name(event.relation.app.name) + group_name = self.group_name(event.relation.app.name) unit_name = event.unit.name + self.remove_alert_rules(group_name, unit_name) + + def remove_alert_rules(self, group_name: str, unit_name: str) -> None: + """Remove an alert rule group from relation data.""" + if not self._charm.unit.is_leader(): + return for relation in self.model.relations[self._prometheus_relation]: alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) @@ -2038,34 +2181,8 @@ def _remove_alert_rules(self, event): json.dumps({"groups": groups}) if groups else "{}" ) - def _get_targets(self, relation) -> dict: - """Fetch scrape targets for a relation. - - Scrape target information is returned for each unit in the - relation. This information contains the unit name, network - hostname (or address) for that unit, and port on which a - metrics endpoint is exposed in that unit. - - Args: - relation: an `ops.model.Relation` object for which scrape - targets are required. - - Returns: - a dictionary whose keys are names of the units in the - relation. There values associated with each key is itself - a dictionary of the form - ``` - {"hostname": hostname, "port": port} - ``` - """ - targets = {} - for unit in relation.units: - port = relation.data[unit].get("port", 80) - hostname = relation.data[unit].get("hostname") - if hostname: - targets.update({unit.name: {"hostname": hostname, "port": port}}) - - return targets + if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore + self._stored.alert_rules = groups def _get_alert_rules(self, relation) -> dict: """Fetch alert rules for a relation. @@ -2093,23 +2210,7 @@ def _get_alert_rules(self, relation) -> dict: return rules - def _job_name(self, appname) -> str: - """Construct a scrape job name. - - Each relation has its own unique scrape job name. All units in - the relation are scraped as part of the same scrape job. 
- - Args: - appname: string name of a related application. - - Returns: - a string Prometheus scrape job name for the application. - """ - return "juju_{}_{}_{}_prometheus_scrape".format( - self.model.name, self.model.uuid[:7], appname - ) - - def _group_name(self, appname) -> str: + def group_name(self, unit_name: str) -> str: """Construct name for an alert rule group. Each unit in a relation may define its own alert rules. All @@ -2117,20 +2218,21 @@ def _group_name(self, appname) -> str: given a single alert rule group name. Args: - appname: string name of a related application. + unit_name: string name of a related application. Returns: - a string Prometheus alert rules group name for the application. + a string Prometheus alert rules group name for the unit. """ - return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], appname) + unit_name = re.sub(r"/", "_", unit_name) + return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], unit_name) - def _label_alert_rules(self, unit_rules, appname) -> list: + def _label_alert_rules(self, unit_rules, app_name: str) -> list: """Apply juju topology labels to alert rules. Args: unit_rules: a list of alert rules, where each rule is in dictionary format. - appname: a string name of the application to which the + app_name: a string name of the application to which the alert rules belong. 
Returns: @@ -2142,7 +2244,7 @@ def _label_alert_rules(self, unit_rules, appname) -> list: # the new JujuTopology removed this, so build it up by hand matchers = { "juju_{}".format(k): v - for k, v in JujuTopology(self.model.name, self.model.uuid, appname, unit_name) + for k, v in JujuTopology(self.model.name, self.model.uuid, app_name, unit_name) .as_dict(excluded_keys=["charm_name"]) .items() } @@ -2151,76 +2253,6 @@ def _label_alert_rules(self, unit_rules, appname) -> list: return labeled_rules - def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: - """Construct a static scrape job for an application. - - Args: - targets: a dictionary providing hostname and port for all - scrape target. The keys of this dictionary are unit - names. Values corresponding to these keys are - themselves a dictionary with keys "hostname" and - "port". - application_name: a string name of the application for - which this static scrape job is being constructed. - - Returns: - A dictionary corresponding to a Prometheus static scrape - job configuration for one application. The returned - dictionary may be transformed into YAML and appended to - the list of any existing list of Prometheus static configs. - """ - juju_model = self.model.name - juju_model_uuid = self.model.uuid - job = { - "job_name": self._job_name(application_name), - "static_configs": [ - { - "targets": ["{}:{}".format(target["hostname"], target["port"])], - "labels": { - "juju_model": juju_model, - "juju_model_uuid": juju_model_uuid, - "juju_application": application_name, - "juju_unit": unit_name, - "host": target["hostname"], - }, - } - for unit_name, target in targets.items() - ], - "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), - } - job.update(kwargs.get("updates", {})) - - return job - - @property - def _relabel_configs(self) -> list: - """Create Juju topology relabeling configuration. 
- - Using Juju topology for instance labels ensures that these - labels are stable across unit recreation. - - Returns: - a list of Prometheus relabeling configurations. Each item in - this list is one relabel configuration. - """ - return ( - [ - { - "source_labels": [ - "juju_model", - "juju_model_uuid", - "juju_application", - "juju_unit", - ], - "separator": "_", - "target_label": "instance", - "regex": "(.*)", - } - ] - if self._relabel_instance - else [] - ) - class CosTool: """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" @@ -2291,6 +2323,22 @@ def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: ] ) + def validate_scrape_jobs(self, jobs: list) -> bool: + """Validate scrape jobs using cos-tool.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating scrape jobs.") + return True + conf = {"scrape_configs": jobs} + with tempfile.NamedTemporaryFile() as tmpfile: + with open(tmpfile.name, "w") as f: + f.write(yaml.safe_dump(conf)) + try: + self._exec([str(self.path), "validate-config", tmpfile.name]) + except subprocess.CalledProcessError as e: + logger.error("Validating scrape jobs failed: {}".format(e.output)) + raise + return True + def inject_label_matchers(self, expression, topology) -> str: """Add label matchers to an expression.""" if not topology: