diff --git a/charmcraft.yaml b/charmcraft.yaml index 72b0520..5a623bd 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -29,6 +29,10 @@ requires: interface: slurmdbd slurmrestd: interface: slurmrestd +provides: + cos-agent: + interface: cos_agent + limit: 1 assumes: - juju diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py new file mode 100644 index 0000000..870ba62 --- /dev/null +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -0,0 +1,806 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +r"""## Overview. + +This library can be used to manage the cos_agent relation interface: + +- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics + or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through + the Grafana Agent machine charm. + +- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of + the `cos_agent` interface. + + +## COSAgentProvider Library Usage + +Grafana Agent machine Charmed Operator interacts with its clients using the cos_agent library. +Charms seeking to send telemetry, must do so using the `COSAgentProvider` object from +this charm library. + +Using the `COSAgentProvider` object only requires instantiating it, +typically in the `__init__` method of your charm (the one which sends telemetry). 
+ +The constructor of `COSAgentProvider` has only one required and nine optional parameters: + +```python + def __init__( + self, + charm: CharmType, + relation_name: str = DEFAULT_RELATION_NAME, + metrics_endpoints: Optional[List[_MetricsEndpointDict]] = None, + metrics_rules_dir: str = "./src/prometheus_alert_rules", + logs_rules_dir: str = "./src/loki_alert_rules", + recurse_rules_dirs: bool = False, + log_slots: Optional[List[str]] = None, + dashboard_dirs: Optional[List[str]] = None, + refresh_events: Optional[List] = None, + scrape_configs: Optional[Union[List[Dict], Callable]] = None, + ): +``` + +### Parameters + +- `charm`: The instance of the charm that instantiates `COSAgentProvider`, typically `self`. + +- `relation_name`: If your charmed operator uses a relation name other than `cos-agent` to use + the `cos_agent` interface, this is where you have to specify that. + +- `metrics_endpoints`: In this parameter you can specify the metrics endpoints that Grafana Agent + machine Charmed Operator will scrape. The configs of this list will be merged with the configs + from `scrape_configs`. + +- `metrics_rules_dir`: The directory in which the Charmed Operator stores its metrics alert rules + files. + +- `logs_rules_dir`: The directory in which the Charmed Operator stores its logs alert rules files. + +- `recurse_rules_dirs`: This parameters set whether Grafana Agent machine Charmed Operator has to + search alert rules files recursively in the previous two directories or not. + +- `log_slots`: Snap slots to connect to for scraping logs in the form ["snap-name:slot", ...]. + +- `dashboard_dirs`: List of directories where the dashboards are stored in the Charmed Operator. + +- `refresh_events`: List of events on which to refresh relation data. + +- `scrape_configs`: List of standard scrape_configs dicts or a callable that returns the list in + case the configs need to be generated dynamically. 
The contents of this list will be merged + with the configs from `metrics_endpoints`. + + +### Example 1 - Minimal instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... +class TelemetryProviderCharm(CharmBase): + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider(self) +``` + +### Example 2 - Full instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... +class TelemetryProviderCharm(CharmBase): + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider( + self, + relation_name="custom-cos-agent", + metrics_endpoints=[ + # specify "path" and "port" to scrape from localhost + {"path": "/metrics", "port": 9000}, + {"path": "/metrics", "port": 9001}, + {"path": "/metrics", "port": 9002}, + ], + metrics_rules_dir="./src/alert_rules/prometheus", + logs_rules_dir="./src/alert_rules/loki", + recursive_rules_dir=True, + log_slots=["my-app:slot"], + dashboard_dirs=["./src/dashboards_1", "./src/dashboards_2"], + refresh_events=["update-status", "upgrade-charm"], + scrape_configs=[ + { + "job_name": "custom_job", + "metrics_path": "/metrics", + "authorization": {"credentials": "bearer-token"}, + "static_configs": [ + { + "targets": ["localhost:9003"]}, + "labels": {"key": "value"}, + }, + ], + }, + ] + ) +``` + +### Example 3 - Dynamic scrape configs generation: + +Pass a function to the `scrape_configs` to decouple the generation of the configs +from the instantiation of the COSAgentProvider object. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... 
+ +class TelemetryProviderCharm(CharmBase): + def generate_scrape_configs(self): + return [ + { + "job_name": "custom", + "metrics_path": "/metrics", + "static_configs": [{"targets": ["localhost:9000"]}], + }, + ] + + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider( + self, + scrape_configs=self.generate_scrape_configs, + ) +``` + +## COSAgentConsumer Library Usage + +This object may be used by any Charmed Operator which gathers telemetry data by +implementing the consumer side of the `cos_agent` interface. +For instance Grafana Agent machine Charmed Operator. + +For this purpose the charm needs to instantiate the `COSAgentConsumer` object with one mandatory +and two optional arguments. + +### Parameters + +- `charm`: A reference to the parent (Grafana Agent machine) charm. + +- `relation_name`: The name of the relation that the charm uses to interact + with its clients that provides telemetry data using the `COSAgentProvider` object. + + If provided, this relation name must match a provided relation in metadata.yaml with the + `cos_agent` interface. + The default value of this argument is "cos-agent". + +- `refresh_events`: List of events on which to refresh relation data. + + +### Example 1 - Minimal instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentConsumer +... +class GrafanaAgentMachineCharm(GrafanaAgentCharm) + def __init__(self, *args): + ... + self._cos = COSAgentRequirer(self) +``` + + +### Example 2 - Full instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentConsumer +... +class GrafanaAgentMachineCharm(GrafanaAgentCharm) + def __init__(self, *args): + ... 
+ self._cos = COSAgentRequirer( + self, + relation_name="cos-agent-consumer", + refresh_events=["update-status", "upgrade-charm"], + ) +``` +""" + +import json +import logging +from collections import namedtuple +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union + +import pydantic +from cosl import GrafanaDashboard, JujuTopology +from cosl.rules import AlertRules +from ops.charm import RelationChangedEvent +from ops.framework import EventBase, EventSource, Object, ObjectEvents +from ops.model import Relation +from ops.testing import CharmType + +if TYPE_CHECKING: + try: + from typing import TypedDict + + class _MetricsEndpointDict(TypedDict): + path: str + port: int + + except ModuleNotFoundError: + _MetricsEndpointDict = Dict # pyright: ignore + +LIBID = "dc15fa84cef84ce58155fb84f6c6213a" +LIBAPI = 0 +LIBPATCH = 8 + +PYDEPS = ["cosl", "pydantic < 2"] + +DEFAULT_RELATION_NAME = "cos-agent" +DEFAULT_PEER_RELATION_NAME = "peers" +DEFAULT_SCRAPE_CONFIG = { + "static_configs": [{"targets": ["localhost:80"]}], + "metrics_path": "/metrics", +} + +logger = logging.getLogger(__name__) +SnapEndpoint = namedtuple("SnapEndpoint", "owner, name") + + +class CosAgentProviderUnitData(pydantic.BaseModel): + """Unit databag model for `cos-agent` relation.""" + + # The following entries are the same for all units of the same principal. + # Note that the same grafana agent subordinate may be related to several apps. + # this needs to make its way to the gagent leader + metrics_alert_rules: dict + log_alert_rules: dict + dashboards: List[GrafanaDashboard] + # subordinate is no longer used but we should keep it until we bump the library to ensure + # we don't break compatibility. + subordinate: Optional[bool] = None + + # The following entries may vary across units of the same principal app. 
+ # this data does not need to be forwarded to the gagent leader + metrics_scrape_jobs: List[Dict] + log_slots: List[str] + + # when this whole datastructure is dumped into a databag, it will be nested under this key. + # while not strictly necessary (we could have it 'flattened out' into the databag), + # this simplifies working with the model. + KEY: ClassVar[str] = "config" + + +class CosAgentPeersUnitData(pydantic.BaseModel): + """Unit databag model for `peers` cos-agent machine charm peer relation.""" + + # We need the principal unit name and relation metadata to be able to render identifiers + # (e.g. topology) on the leader side, after all the data moves into peer data (the grafana + # agent leader can only see its own principal, because it is a subordinate charm). + unit_name: str + relation_id: str + relation_name: str + + # The only data that is forwarded to the leader is data that needs to go into the app databags + # of the outgoing o11y relations. + metrics_alert_rules: Optional[dict] + log_alert_rules: Optional[dict] + dashboards: Optional[List[GrafanaDashboard]] + + # when this whole datastructure is dumped into a databag, it will be nested under this key. + # while not strictly necessary (we could have it 'flattened out' into the databag), + # this simplifies working with the model. + KEY: ClassVar[str] = "config" + + @property + def app_name(self) -> str: + """Parse out the app name from the unit name. + + TODO: Switch to using `model_post_init` when pydantic v2 is released? 
+ https://github.com/pydantic/pydantic/issues/1729#issuecomment-1300576214 + """ + return self.unit_name.split("/")[0] + + +class COSAgentProvider(Object): + """Integration endpoint wrapper for the provider side of the cos_agent interface.""" + + def __init__( + self, + charm: CharmType, + relation_name: str = DEFAULT_RELATION_NAME, + metrics_endpoints: Optional[List["_MetricsEndpointDict"]] = None, + metrics_rules_dir: str = "./src/prometheus_alert_rules", + logs_rules_dir: str = "./src/loki_alert_rules", + recurse_rules_dirs: bool = False, + log_slots: Optional[List[str]] = None, + dashboard_dirs: Optional[List[str]] = None, + refresh_events: Optional[List] = None, + *, + scrape_configs: Optional[Union[List[dict], Callable]] = None, + ): + """Create a COSAgentProvider instance. + + Args: + charm: The `CharmBase` instance that is instantiating this object. + relation_name: The name of the relation to communicate over. + metrics_endpoints: List of endpoints in the form [{"path": path, "port": port}, ...]. + This argument is a simplified form of the `scrape_configs`. + The contents of this list will be merged with the contents of `scrape_configs`. + metrics_rules_dir: Directory where the metrics rules are stored. + logs_rules_dir: Directory where the logs rules are stored. + recurse_rules_dirs: Whether to recurse into rule paths. + log_slots: Snap slots to connect to for scraping logs + in the form ["snap-name:slot", ...]. + dashboard_dirs: Directory where the dashboards are stored. + refresh_events: List of events on which to refresh relation data. + scrape_configs: List of standard scrape_configs dicts or a callable + that returns the list in case the configs need to be generated dynamically. + The contents of this list will be merged with the contents of `metrics_endpoints`. 
+ """ + super().__init__(charm, relation_name) + dashboard_dirs = dashboard_dirs or ["./src/grafana_dashboards"] + + self._charm = charm + self._relation_name = relation_name + self._metrics_endpoints = metrics_endpoints or [] + self._scrape_configs = scrape_configs or [] + self._metrics_rules = metrics_rules_dir + self._logs_rules = logs_rules_dir + self._recursive = recurse_rules_dirs + self._log_slots = log_slots or [] + self._dashboard_dirs = dashboard_dirs + self._refresh_events = refresh_events or [self._charm.on.config_changed] + + events = self._charm.on[relation_name] + self.framework.observe(events.relation_joined, self._on_refresh) + self.framework.observe(events.relation_changed, self._on_refresh) + for event in self._refresh_events: + self.framework.observe(event, self._on_refresh) + + def _on_refresh(self, event): + """Trigger the class to update relation data.""" + relations = self._charm.model.relations[self._relation_name] + + for relation in relations: + # Before a principal is related to the grafana-agent subordinate, we'd get + # ModelError: ERROR cannot read relation settings: unit "zk/2": settings not found + # Add a guard to make sure it doesn't happen. + if relation.data and self._charm.unit in relation.data: + # Subordinate relations can communicate only over unit data. + try: + data = CosAgentProviderUnitData( + metrics_alert_rules=self._metrics_alert_rules, + log_alert_rules=self._log_alert_rules, + dashboards=self._dashboards, + metrics_scrape_jobs=self._scrape_jobs, + log_slots=self._log_slots, + ) + relation.data[self._charm.unit][data.KEY] = data.json() + except ( + pydantic.ValidationError, + json.decoder.JSONDecodeError, + ) as e: + logger.error("Invalid relation data provided: %s", e) + + @property + def _scrape_jobs(self) -> List[Dict]: + """Return a prometheus_scrape-like data structure for jobs. 
+ + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + """ + if callable(self._scrape_configs): + scrape_configs = self._scrape_configs() + else: + # Create a copy of the user scrape_configs, since we will mutate this object + scrape_configs = self._scrape_configs.copy() + + # Convert "metrics_endpoints" to standard scrape_configs, and add them in + for endpoint in self._metrics_endpoints: + scrape_configs.append( + { + "metrics_path": endpoint["path"], + "static_configs": [{"targets": [f"localhost:{endpoint['port']}"]}], + } + ) + + scrape_configs = scrape_configs or [DEFAULT_SCRAPE_CONFIG] + + # Augment job name to include the app name and a unique id (index) + for idx, scrape_config in enumerate(scrape_configs): + scrape_config["job_name"] = "_".join( + [self._charm.app.name, str(idx), scrape_config.get("job_name", "default")] + ) + + return scrape_configs + + @property + def _metrics_alert_rules(self) -> Dict: + """Use (for now) the prometheus_scrape AlertRules to initialize this.""" + alert_rules = AlertRules( + query_type="promql", topology=JujuTopology.from_charm(self._charm) + ) + alert_rules.add_path(self._metrics_rules, recursive=self._recursive) + return alert_rules.as_dict() + + @property + def _log_alert_rules(self) -> Dict: + """Use (for now) the loki_push_api AlertRules to initialize this.""" + alert_rules = AlertRules(query_type="logql", topology=JujuTopology.from_charm(self._charm)) + alert_rules.add_path(self._logs_rules, recursive=self._recursive) + return alert_rules.as_dict() + + @property + def _dashboards(self) -> List[GrafanaDashboard]: + dashboards: List[GrafanaDashboard] = [] + for d in self._dashboard_dirs: + for path in Path(d).glob("*"): + dashboard = GrafanaDashboard._serialize(path.read_bytes()) + dashboards.append(dashboard) + return dashboards + + +class COSAgentDataChanged(EventBase): + """Event emitted by `COSAgentRequirer` when relation data changes.""" + + +class 
COSAgentValidationError(EventBase): + """Event emitted by `COSAgentRequirer` when there is an error in the relation data.""" + + def __init__(self, handle, message: str = ""): + super().__init__(handle) + self.message = message + + def snapshot(self) -> Dict: + """Save COSAgentValidationError source information.""" + return {"message": self.message} + + def restore(self, snapshot): + """Restore COSAgentValidationError source information.""" + self.message = snapshot["message"] + + +class COSAgentRequirerEvents(ObjectEvents): + """`COSAgentRequirer` events.""" + + data_changed = EventSource(COSAgentDataChanged) + validation_error = EventSource(COSAgentValidationError) + + +class COSAgentRequirer(Object): + """Integration endpoint wrapper for the Requirer side of the cos_agent interface.""" + + on = COSAgentRequirerEvents() # pyright: ignore + + def __init__( + self, + charm: CharmType, + *, + relation_name: str = DEFAULT_RELATION_NAME, + peer_relation_name: str = DEFAULT_PEER_RELATION_NAME, + refresh_events: Optional[List[str]] = None, + ): + """Create a COSAgentRequirer instance. + + Args: + charm: The `CharmBase` instance that is instantiating this object. + relation_name: The name of the relation to communicate over. + peer_relation_name: The name of the peer relation to communicate over. + refresh_events: List of events on which to refresh relation data. + """ + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._peer_relation_name = peer_relation_name + self._refresh_events = refresh_events or [self._charm.on.config_changed] + + events = self._charm.on[relation_name] + self.framework.observe( + events.relation_joined, self._on_relation_data_changed + ) # TODO: do we need this? 
+ self.framework.observe(events.relation_changed, self._on_relation_data_changed) + for event in self._refresh_events: + self.framework.observe(event, self.trigger_refresh) # pyright: ignore + + # Peer relation events + # A peer relation is needed as it is the only mechanism for exchanging data across + # subordinate units. + # self.framework.observe( + # self.on[self._peer_relation_name].relation_joined, self._on_peer_relation_joined + # ) + peer_events = self._charm.on[peer_relation_name] + self.framework.observe(peer_events.relation_changed, self._on_peer_relation_changed) + + @property + def peer_relation(self) -> Optional["Relation"]: + """Helper function for obtaining the peer relation object. + + Returns: peer relation object + (NOTE: would return None if called too early, e.g. during install). + """ + return self.model.get_relation(self._peer_relation_name) + + def _on_peer_relation_changed(self, _): + # Peer data is used for forwarding data from principal units to the grafana agent + # subordinate leader, for updating the app data of the outgoing o11y relations. + if self._charm.unit.is_leader(): + self.on.data_changed.emit() # pyright: ignore + + def _on_relation_data_changed(self, event: RelationChangedEvent): + # Peer data is the only means of communication between subordinate units. 
+ if not self.peer_relation: + event.defer() + return + + cos_agent_relation = event.relation + if not event.unit or not cos_agent_relation.data.get(event.unit): + return + principal_unit = event.unit + + # Coherence check + units = cos_agent_relation.units + if len(units) > 1: + # should never happen + raise ValueError( + f"unexpected error: subordinate relation {cos_agent_relation} " + f"should have exactly one unit" + ) + + if not (raw := cos_agent_relation.data[principal_unit].get(CosAgentProviderUnitData.KEY)): + return + + if not (provider_data := self._validated_provider_data(raw)): + return + + # Copy data from the cos_agent relation to the peer relation, so the leader could + # follow up. + # Save the originating unit name, so it could be used for topology later on by the leader. + data = CosAgentPeersUnitData( # peer relation databag model + unit_name=event.unit.name, + relation_id=str(event.relation.id), + relation_name=event.relation.name, + metrics_alert_rules=provider_data.metrics_alert_rules, + log_alert_rules=provider_data.log_alert_rules, + dashboards=provider_data.dashboards, + ) + self.peer_relation.data[self._charm.unit][ + f"{CosAgentPeersUnitData.KEY}-{event.unit.name}" + ] = data.json() + + # We can't easily tell if the data that was changed is limited to only the data + # that goes into peer relation (in which case, if this is not a leader unit, we wouldn't + # need to emit `on.data_changed`), so we're emitting `on.data_changed` either way. 
+ self.on.data_changed.emit() # pyright: ignore + + def _validated_provider_data(self, raw) -> Optional[CosAgentProviderUnitData]: + try: + return CosAgentProviderUnitData(**json.loads(raw)) + except (pydantic.ValidationError, json.decoder.JSONDecodeError) as e: + self.on.validation_error.emit(message=str(e)) # pyright: ignore + return None + + def trigger_refresh(self, _): + """Trigger a refresh of relation data.""" + # FIXME: Figure out what we should do here + self.on.data_changed.emit() # pyright: ignore + + @property + def _remote_data(self) -> List[Tuple[CosAgentProviderUnitData, JujuTopology]]: + """Return a list of remote data from each of the related units. + + Assumes that the relation is of type subordinate. + Relies on the fact that, for subordinate relations, the only remote unit visible to + *this unit* is the principal unit that this unit is attached to. + """ + all_data = [] + + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units: + continue + unit = next(iter(relation.units)) + if not (raw := relation.data[unit].get(CosAgentProviderUnitData.KEY)): + continue + if not (provider_data := self._validated_provider_data(raw)): + continue + + topology = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=unit.app.name, + unit=unit.name, + ) + + all_data.append((provider_data, topology)) + + return all_data + + def _gather_peer_data(self) -> List[CosAgentPeersUnitData]: + """Collect data from the peers. + + Returns a trimmed-down list of CosAgentPeersUnitData. + """ + relation = self.peer_relation + + # Ensure that whatever context we're running this in, we take the necessary precautions: + if not relation or not relation.data or not relation.app: + return [] + + # Iterate over all peer unit data and only collect every principal once. 
+ peer_data: List[CosAgentPeersUnitData] = [] + app_names: Set[str] = set() + + for unit in chain((self._charm.unit,), relation.units): + if not relation.data.get(unit): + continue + + for unit_name in relation.data.get(unit): # pyright: ignore + if not unit_name.startswith(CosAgentPeersUnitData.KEY): + continue + raw = relation.data[unit].get(unit_name) + if raw is None: + continue + data = CosAgentPeersUnitData(**json.loads(raw)) + # Have we already seen this principal app? + if (app_name := data.app_name) in app_names: + continue + peer_data.append(data) + app_names.add(app_name) + + return peer_data + + @property + def metrics_alerts(self) -> Dict[str, Any]: + """Fetch metrics alerts.""" + alert_rules = {} + + seen_apps: List[str] = [] + for data in self._gather_peer_data(): + if rules := data.metrics_alert_rules: + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! + seen_apps.append(app_name) + # This is only used for naming the file, so be as specific as we can be + identifier = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=app_name, + # For the topology unit, we could use `data.principal_unit_name`, but that unit + # name may not be very stable: `_gather_peer_data` de-duplicates by app name so + # the exact unit name that turns up first in the iterator may vary from time to + # time. So using the grafana-agent unit name instead. + unit=self._charm.unit.name, + ).identifier + + alert_rules[identifier] = rules + + return alert_rules + + @property + def metrics_jobs(self) -> List[Dict]: + """Parse the relation data contents and extract the metrics jobs.""" + scrape_jobs = [] + for data, topology in self._remote_data: + for job in data.metrics_scrape_jobs: + # In #220, relation schema changed from a simplified dict to the standard + # `scrape_configs`. + # This is to ensure backwards compatibility with Providers older than v0.5. 
+ if "path" in job and "port" in job and "job_name" in job: + job = { + "job_name": job["job_name"], + "metrics_path": job["path"], + "static_configs": [{"targets": [f"localhost:{job['port']}"]}], + # We include insecure_skip_verify because we are always scraping localhost. + # Even if we have the certs for the scrape targets, we'd rather specify the scrape + # jobs with localhost rather than the SAN DNS the cert was issued for. + "tls_config": {"insecure_skip_verify": True}, + } + + # Apply labels to the scrape jobs + for static_config in job.get("static_configs", []): + topo_as_dict = topology.as_dict(excluded_keys=["charm_name"]) + static_config["labels"] = { + # Be sure to keep labels from static_config + **static_config.get("labels", {}), + # TODO: We should add a new method in juju_topology.py + # that like `as_dict` method, returns the keys with juju_ prefix + # https://github.com/canonical/cos-lib/issues/18 + **{ + "juju_{}".format(key): value + for key, value in topo_as_dict.items() + if value + }, + } + + scrape_jobs.append(job) + + return scrape_jobs + + @property + def snap_log_endpoints(self) -> List[SnapEndpoint]: + """Fetch logging endpoints exposed by related snaps.""" + plugs = [] + for data, _ in self._remote_data: + targets = data.log_slots + if targets: + for target in targets: + if target in plugs: + logger.warning( + f"plug {target} already listed. " + "The same snap is being passed from multiple " + "endpoints; this should not happen." + ) + else: + plugs.append(target) + + endpoints = [] + for plug in plugs: + if ":" not in plug: + logger.error(f"invalid plug definition received: {plug}. 
Ignoring...") + else: + endpoint = SnapEndpoint(*plug.split(":")) + endpoints.append(endpoint) + return endpoints + + @property + def logs_alerts(self) -> Dict[str, Any]: + """Fetch log alerts.""" + alert_rules = {} + seen_apps: List[str] = [] + + for data in self._gather_peer_data(): + if rules := data.log_alert_rules: + # This is only used for naming the file, so be as specific as we can be + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! + seen_apps.append(app_name) + + identifier = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=app_name, + # For the topology unit, we could use `data.unit_name`, but that unit + # name may not be very stable: `_gather_peer_data` de-duplicates by app name so + # the exact unit name that turns up first in the iterator may vary from time to + # time. So using the grafana-agent unit name instead. + unit=self._charm.unit.name, + ).identifier + + alert_rules[identifier] = rules + + return alert_rules + + @property + def dashboards(self) -> List[Dict[str, str]]: + """Fetch dashboards as encoded content. + + Dashboards are assumed not to vary across units of the same primary. + """ + dashboards: List[Dict[str, Any]] = [] + + seen_apps: List[str] = [] + for data in self._gather_peer_data(): + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! 
+ seen_apps.append(app_name) + + for encoded_dashboard in data.dashboards or (): + content = GrafanaDashboard(encoded_dashboard)._deserialize() + + title = content.get("title", "no_title") + + dashboards.append( + { + "relation_id": data.relation_id, + # We have the remote charm name - use it for the identifier + "charm": f"{data.relation_name}-{app_name}", + "content": content, + "title": title, + } + ) + + return dashboards diff --git a/lib/charms/operator_libs_linux/v2/snap.py b/lib/charms/operator_libs_linux/v2/snap.py new file mode 100644 index 0000000..9d09a78 --- /dev/null +++ b/lib/charms/operator_libs_linux/v2/snap.py @@ -0,0 +1,1160 @@ +# Copyright 2021 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Representations of the system's Snaps, and abstractions around managing them. + +The `snap` module provides convenience methods for listing, installing, refreshing, and removing +Snap packages, in addition to setting and getting configuration options for them. + +In the `snap` module, `SnapCache` creates a dict-like mapping of `Snap` objects at when +instantiated. Installed snaps are fully populated, and available snaps are lazily-loaded upon +request. This module relies on an installed and running `snapd` daemon to perform operations over +the `snapd` HTTP API. + +`SnapCache` objects can be used to install or modify Snap packages by name in a manner similar to +using the `snap` command from the commandline. 
+ +An example of adding Juju to the system with `SnapCache` and setting a config value: + +```python +try: + cache = snap.SnapCache() + juju = cache["juju"] + + if not juju.present: + juju.ensure(snap.SnapState.Latest, channel="beta") + juju.set({"some.key": "value", "some.key2": "value2"}) +except snap.SnapError as e: + logger.error("An exception occurred when installing charmcraft. Reason: %s", e.message) +``` + +In addition, the `snap` module provides "bare" methods which can act on Snap packages as +simple function calls. :meth:`add`, :meth:`remove`, and :meth:`ensure` are provided, as +well as :meth:`add_local` for installing directly from a local `.snap` file. These return +`Snap` objects. + +As an example of installing several Snaps and checking details: + +```python +try: + nextcloud, charmcraft = snap.add(["nextcloud", "charmcraft"]) + if nextcloud.get("mode") != "production": + nextcloud.set({"mode": "production"}) +except snap.SnapError as e: + logger.error("An exception occurred when installing snaps. 
Reason: %s" % e.message) +``` +""" + +import http.client +import json +import logging +import os +import re +import socket +import subprocess +import sys +import urllib.error +import urllib.parse +import urllib.request +from collections.abc import Mapping +from datetime import datetime, timedelta, timezone +from enum import Enum +from subprocess import CalledProcessError, CompletedProcess +from typing import Any, Dict, Iterable, List, Optional, Union + +logger = logging.getLogger(__name__) + +# The unique Charmhub library identifier, never change it +LIBID = "05394e5893f94f2d90feb7cbe6b633cd" + +# Increment this major API version when introducing breaking changes +LIBAPI = 2 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 7 + + +# Regex to locate 7-bit C1 ANSI sequences +ansi_filter = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") + + +def _cache_init(func): + def inner(*args, **kwargs): + if _Cache.cache is None: + _Cache.cache = SnapCache() + return func(*args, **kwargs) + + return inner + + +# recursive hints seems to error out pytest +JSONType = Union[Dict[str, Any], List[Any], str, int, float] + + +class SnapService: + """Data wrapper for snap services.""" + + def __init__( + self, + daemon: Optional[str] = None, + daemon_scope: Optional[str] = None, + enabled: bool = False, + active: bool = False, + activators: List[str] = [], + **kwargs, + ): + self.daemon = daemon + self.daemon_scope = kwargs.get("daemon-scope", None) or daemon_scope + self.enabled = enabled + self.active = active + self.activators = activators + + def as_dict(self) -> Dict: + """Return instance representation as dict.""" + return { + "daemon": self.daemon, + "daemon_scope": self.daemon_scope, + "enabled": self.enabled, + "active": self.active, + "activators": self.activators, + } + + +class MetaCache(type): + """MetaCache class used for initialising the snap cache.""" + + @property + def 
cache(cls) -> "SnapCache": + """Property for returning the snap cache.""" + return cls._cache + + @cache.setter + def cache(cls, cache: "SnapCache") -> None: + """Setter for the snap cache.""" + cls._cache = cache + + def __getitem__(cls, name) -> "Snap": + """Snap cache getter.""" + return cls._cache[name] + + +class _Cache(object, metaclass=MetaCache): + _cache = None + + +class Error(Exception): + """Base class of most errors raised by this library.""" + + def __repr__(self): + """Represent the Error class.""" + return "<{}.{} {}>".format(type(self).__module__, type(self).__name__, self.args) + + @property + def name(self): + """Return a string representation of the model plus class.""" + return "<{}.{}>".format(type(self).__module__, type(self).__name__) + + @property + def message(self): + """Return the message passed as an argument.""" + return self.args[0] + + +class SnapAPIError(Error): + """Raised when an HTTP API error occurs talking to the Snapd server.""" + + def __init__(self, body: Dict, code: int, status: str, message: str): + super().__init__(message) # Makes str(e) return message + self.body = body + self.code = code + self.status = status + self._message = message + + def __repr__(self): + """Represent the SnapAPIError class.""" + return "APIError({!r}, {!r}, {!r}, {!r})".format( + self.body, self.code, self.status, self._message + ) + + +class SnapState(Enum): + """The state of a snap on the system or in the cache.""" + + Present = "present" + Absent = "absent" + Latest = "latest" + Available = "available" + + +class SnapError(Error): + """Raised when there's an error running snap control commands.""" + + +class SnapNotFoundError(Error): + """Raised when a requested snap is not known to the system.""" + + +class Snap(object): + """Represents a snap package and its properties. 
+ + `Snap` exposes the following properties about a snap: + - name: the name of the snap + - state: a `SnapState` representation of its install status + - channel: "stable", "candidate", "beta", and "edge" are common + - revision: a string representing the snap's revision + - confinement: "classic", "strict", or "devmode" + """ + + def __init__( + self, + name, + state: SnapState, + channel: str, + revision: str, + confinement: str, + apps: Optional[List[Dict[str, str]]] = None, + cohort: Optional[str] = "", + ) -> None: + self._name = name + self._state = state + self._channel = channel + self._revision = revision + self._confinement = confinement + self._cohort = cohort + self._apps = apps or [] + self._snap_client = SnapClient() + + def __eq__(self, other) -> bool: + """Equality for comparison.""" + return isinstance(other, self.__class__) and ( + self._name, + self._revision, + ) == (other._name, other._revision) + + def __hash__(self): + """Calculate a hash for this snap.""" + return hash((self._name, self._revision)) + + def __repr__(self): + """Represent the object such that it can be reconstructed.""" + return "<{}.{}: {}>".format(self.__module__, self.__class__.__name__, self.__dict__) + + def __str__(self): + """Represent the snap object as a string.""" + return "<{}: {}-{}.{} -- {}>".format( + self.__class__.__name__, + self._name, + self._revision, + self._channel, + str(self._state), + ) + + def _snap(self, command: str, optargs: Optional[Iterable[str]] = None) -> str: + """Perform a snap operation. 
+ + Args: + command: the snap command to execute + optargs: an (optional) list of additional arguments to pass, + commonly confinement or channel + + Raises: + SnapError if there is a problem encountered + """ + optargs = optargs or [] + args = ["snap", command, self._name, *optargs] + try: + return subprocess.check_output(args, universal_newlines=True) + except CalledProcessError as e: + raise SnapError( + "Snap: {!r}; command {!r} failed with output = {!r}".format( + self._name, args, e.output + ) + ) + + def _snap_daemons( + self, + command: List[str], + services: Optional[List[str]] = None, + ) -> CompletedProcess: + """Perform snap app commands. + + Args: + command: the snap command to execute + services: the snap service to execute command on + + Raises: + SnapError if there is a problem encountered + """ + if services: + # an attempt to keep the command constrained to the snap instance's services + services = ["{}.{}".format(self._name, service) for service in services] + else: + services = [self._name] + + args = ["snap", *command, *services] + + try: + return subprocess.run(args, universal_newlines=True, check=True, capture_output=True) + except CalledProcessError as e: + raise SnapError("Could not {} for snap [{}]: {}".format(args, self._name, e.stderr)) + + def get(self, key: Optional[str], *, typed: bool = False) -> Any: + """Fetch snap configuration values. + + Args: + key: the key to retrieve. Default to retrieve all values for typed=True. + typed: set to True to retrieve typed values (set with typed=True). + Default is to return a string. + """ + if typed: + args = ["-d"] + if key: + args.append(key) + config = json.loads(self._snap("get", args)) + if key: + return config.get(key) + return config + + if not key: + raise TypeError("Key must be provided when typed=False") + + return self._snap("get", [key]).strip() + + def set(self, config: Dict[str, Any], *, typed: bool = False) -> str: + """Set a snap configuration value. 
+ + Args: + config: a dictionary containing keys and values specifying the config to set. + typed: set to True to convert all values in the config into typed values while + configuring the snap (set with typed=True). Default is not to convert. + """ + if typed: + kv = [f"{key}={json.dumps(val)}" for key, val in config.items()] + return self._snap("set", ["-t"] + kv) + + return self._snap("set", [f"{key}={val}" for key, val in config.items()]) + + def unset(self, key) -> str: + """Unset a snap configuration value. + + Args: + key: the key to unset + """ + return self._snap("unset", [key]) + + def start(self, services: Optional[List[str]] = None, enable: Optional[bool] = False) -> None: + """Start a snap's services. + + Args: + services (list): (optional) list of individual snap services to start (otherwise all) + enable (bool): (optional) flag to enable snap services on start. Default `false` + """ + args = ["start", "--enable"] if enable else ["start"] + self._snap_daemons(args, services) + + def stop(self, services: Optional[List[str]] = None, disable: Optional[bool] = False) -> None: + """Stop a snap's services. + + Args: + services (list): (optional) list of individual snap services to stop (otherwise all) + disable (bool): (optional) flag to disable snap services on stop. Default `False` + """ + args = ["stop", "--disable"] if disable else ["stop"] + self._snap_daemons(args, services) + + def logs(self, services: Optional[List[str]] = None, num_lines: Optional[int] = 10) -> str: + """Fetch a snap services' logs. + + Args: + services (list): (optional) list of individual snap services to show logs from + (otherwise all) + num_lines (int): (optional) integer number of log lines to return. Default `10` + """ + args = ["logs", "-n={}".format(num_lines)] if num_lines else ["logs"] + return self._snap_daemons(args, services).stdout + + def connect( + self, plug: str, service: Optional[str] = None, slot: Optional[str] = None + ) -> None: + """Connect a plug to a slot. 
+ + Args: + plug (str): the plug to connect + service (str): (optional) the snap service name to plug into + slot (str): (optional) the snap service slot to plug in to + + Raises: + SnapError if there is a problem encountered + """ + command = ["connect", "{}:{}".format(self._name, plug)] + + if service and slot: + command = command + ["{}:{}".format(service, slot)] + elif slot: + command = command + [slot] + + args = ["snap", *command] + try: + subprocess.run(args, universal_newlines=True, check=True, capture_output=True) + except CalledProcessError as e: + raise SnapError("Could not {} for snap [{}]: {}".format(args, self._name, e.stderr)) + + def hold(self, duration: Optional[timedelta] = None) -> None: + """Add a refresh hold to a snap. + + Args: + duration: duration for the hold, or None (the default) to hold this snap indefinitely. + """ + hold_str = "forever" + if duration is not None: + seconds = round(duration.total_seconds()) + hold_str = f"{seconds}s" + self._snap("refresh", [f"--hold={hold_str}"]) + + def unhold(self) -> None: + """Remove the refresh hold of a snap.""" + self._snap("refresh", ["--unhold"]) + + def alias(self, application: str, alias: Optional[str] = None) -> None: + """Create an alias for a given application. + + Args: + application: application to get an alias. + alias: (optional) name of the alias; if not provided, the application name is used. + """ + if alias is None: + alias = application + args = ["snap", "alias", f"{self.name}.{application}", alias] + try: + subprocess.check_output(args, universal_newlines=True) + except CalledProcessError as e: + raise SnapError( + "Snap: {!r}; command {!r} failed with output = {!r}".format( + self._name, args, e.output + ) + ) + + def restart( + self, services: Optional[List[str]] = None, reload: Optional[bool] = False + ) -> None: + """Restarts a snap's services. + + Args: + services (list): (optional) list of individual snap services to restart. 
+ (otherwise all) + reload (bool): (optional) flag to use the service reload command, if available. + Default `False` + """ + args = ["restart", "--reload"] if reload else ["restart"] + self._snap_daemons(args, services) + + def _install( + self, + channel: Optional[str] = "", + cohort: Optional[str] = "", + revision: Optional[str] = None, + ) -> None: + """Add a snap to the system. + + Args: + channel: the channel to install from + cohort: optional, the key of a cohort that this snap belongs to + revision: optional, the revision of the snap to install + """ + cohort = cohort or self._cohort + + args = [] + if self.confinement == "classic": + args.append("--classic") + if self.confinement == "devmode": + args.append("--devmode") + if channel: + args.append('--channel="{}"'.format(channel)) + if revision: + args.append('--revision="{}"'.format(revision)) + if cohort: + args.append('--cohort="{}"'.format(cohort)) + + self._snap("install", args) + + def _refresh( + self, + channel: Optional[str] = "", + cohort: Optional[str] = "", + revision: Optional[str] = None, + devmode: bool = False, + leave_cohort: Optional[bool] = False, + ) -> None: + """Refresh a snap. + + Args: + channel: the channel to install from + cohort: optionally, specify a cohort. + revision: optionally, specify the revision of the snap to refresh + devmode: optionally, specify devmode confinement + leave_cohort: leave the current cohort. 
+ """ + args = [] + if channel: + args.append('--channel="{}"'.format(channel)) + + if revision: + args.append('--revision="{}"'.format(revision)) + + if devmode: + args.append("--devmode") + + if not cohort: + cohort = self._cohort + + if leave_cohort: + self._cohort = "" + args.append("--leave-cohort") + elif cohort: + args.append('--cohort="{}"'.format(cohort)) + + self._snap("refresh", args) + + def _remove(self) -> str: + """Remove a snap from the system.""" + return self._snap("remove") + + @property + def name(self) -> str: + """Returns the name of the snap.""" + return self._name + + def ensure( + self, + state: SnapState, + classic: Optional[bool] = False, + devmode: bool = False, + channel: Optional[str] = "", + cohort: Optional[str] = "", + revision: Optional[str] = None, + ): + """Ensure that a snap is in a given state. + + Args: + state: a `SnapState` to reconcile to. + classic: an (Optional) boolean indicating whether classic confinement should be used + devmode: an (Optional) boolean indicating whether devmode confinement should be used + channel: the channel to install from + cohort: optional. Specify the key of a snap cohort. + revision: optional. the revision of the snap to install/refresh + + While both channel and revision could be specified, the underlying snap install/refresh + command will determine which one takes precedence (revision at this time) + + Raises: + SnapError if an error is encountered + """ + if classic and devmode: + raise ValueError("Cannot set both classic and devmode confinement") + + if classic or self._confinement == "classic": + self._confinement = "classic" + elif devmode or self._confinement == "devmode": + self._confinement = "devmode" + else: + self._confinement = "" + + if state not in (SnapState.Present, SnapState.Latest): + # We are attempting to remove this snap. + if self._state in (SnapState.Present, SnapState.Latest): + # The snap is installed, so we run _remove. 
+ self._remove() + else: + # The snap is not installed -- no need to do anything. + pass + else: + # We are installing or refreshing a snap. + if self._state not in (SnapState.Present, SnapState.Latest): + # The snap is not installed, so we install it. + logger.info( + "Installing snap %s, revision %s, tracking %s", self._name, revision, channel + ) + self._install(channel, cohort, revision) + logger.info("The snap installation completed successfully") + elif revision is None or revision != self._revision: + # The snap is installed, but we are changing it (e.g., switching channels). + logger.info( + "Refreshing snap %s, revision %s, tracking %s", self._name, revision, channel + ) + self._refresh(channel=channel, cohort=cohort, revision=revision, devmode=devmode) + logger.info("The snap refresh completed successfully") + else: + logger.info("Refresh of snap %s was unnecessary", self._name) + + self._update_snap_apps() + self._state = state + + def _update_snap_apps(self) -> None: + """Update a snap's apps after snap changes state.""" + try: + self._apps = self._snap_client.get_installed_snap_apps(self._name) + except SnapAPIError: + logger.debug("Unable to retrieve snap apps for {}".format(self._name)) + self._apps = [] + + @property + def present(self) -> bool: + """Report whether or not a snap is present.""" + return self._state in (SnapState.Present, SnapState.Latest) + + @property + def latest(self) -> bool: + """Report whether the snap is the most recent version.""" + return self._state is SnapState.Latest + + @property + def state(self) -> SnapState: + """Report the current snap state.""" + return self._state + + @state.setter + def state(self, state: SnapState) -> None: + """Set the snap state to a given value. + + Args: + state: a `SnapState` to reconcile the snap to. 
+ + Raises: + SnapError if an error is encountered + """ + if self._state is not state: + self.ensure(state) + self._state = state + + @property + def revision(self) -> str: + """Returns the revision for a snap.""" + return self._revision + + @property + def channel(self) -> str: + """Returns the channel for a snap.""" + return self._channel + + @property + def confinement(self) -> str: + """Returns the confinement for a snap.""" + return self._confinement + + @property + def apps(self) -> List: + """Returns (if any) the installed apps of the snap.""" + self._update_snap_apps() + return self._apps + + @property + def services(self) -> Dict: + """Returns (if any) the installed services of the snap.""" + self._update_snap_apps() + services = {} + for app in self._apps: + if "daemon" in app: + services[app["name"]] = SnapService(**app).as_dict() + + return services + + @property + def held(self) -> bool: + """Report whether the snap has a hold.""" + info = self._snap("info") + return "hold:" in info + + +class _UnixSocketConnection(http.client.HTTPConnection): + """Implementation of HTTPConnection that connects to a named Unix socket.""" + + def __init__(self, host, timeout=None, socket_path=None): + if timeout is None: + super().__init__(host) + else: + super().__init__(host, timeout=timeout) + self.socket_path = socket_path + + def connect(self): + """Override connect to use Unix socket (instead of TCP socket).""" + if not hasattr(socket, "AF_UNIX"): + raise NotImplementedError("Unix sockets not supported on {}".format(sys.platform)) + self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + self.sock.connect(self.socket_path) + if self.timeout is not None: + self.sock.settimeout(self.timeout) + + +class _UnixSocketHandler(urllib.request.AbstractHTTPHandler): + """Implementation of HTTPHandler that uses a named Unix socket.""" + + def __init__(self, socket_path: str): + super().__init__() + self.socket_path = socket_path + + def http_open(self, req) -> 
http.client.HTTPResponse: + """Override http_open to use a Unix socket connection (instead of TCP).""" + return self.do_open(_UnixSocketConnection, req, socket_path=self.socket_path) + + +class SnapClient: + """Snapd API client to talk to HTTP over UNIX sockets. + + In order to avoid shelling out and/or involving sudo in calling the snapd API, + use a wrapper based on the Pebble Client, trimmed down to only the utility methods + needed for talking to snapd. + """ + + def __init__( + self, + socket_path: str = "/run/snapd.socket", + opener: Optional[urllib.request.OpenerDirector] = None, + base_url: str = "http://localhost/v2/", + timeout: float = 30.0, + ): + """Initialize a client instance. + + Args: + socket_path: a path to the socket on the filesystem. Defaults to /run/snap/snapd.socket + opener: specifies an opener for unix socket, if unspecified a default is used + base_url: base url for making requests to the snap client. Defaults to + http://localhost/v2/ + timeout: timeout in seconds to use when making requests to the API. Default is 30.0s. + """ + if opener is None: + opener = self._get_default_opener(socket_path) + self.opener = opener + self.base_url = base_url + self.timeout = timeout + + @classmethod + def _get_default_opener(cls, socket_path): + """Build the default opener to use for requests (HTTP over Unix socket).""" + opener = urllib.request.OpenerDirector() + opener.add_handler(_UnixSocketHandler(socket_path)) + opener.add_handler(urllib.request.HTTPDefaultErrorHandler()) + opener.add_handler(urllib.request.HTTPRedirectHandler()) + opener.add_handler(urllib.request.HTTPErrorProcessor()) + return opener + + def _request( + self, + method: str, + path: str, + query: Dict = None, + body: Dict = None, + ) -> JSONType: + """Make a JSON request to the Snapd server with the given HTTP method and path. + + If query dict is provided, it is encoded and appended as a query string + to the URL. 
If body dict is provided, it is serialied as JSON and used + as the HTTP body (with Content-Type: "application/json"). The resulting + body is decoded from JSON. + """ + headers = {"Accept": "application/json"} + data = None + if body is not None: + data = json.dumps(body).encode("utf-8") + headers["Content-Type"] = "application/json" + + response = self._request_raw(method, path, query, headers, data) + return json.loads(response.read().decode())["result"] + + def _request_raw( + self, + method: str, + path: str, + query: Dict = None, + headers: Dict = None, + data: bytes = None, + ) -> http.client.HTTPResponse: + """Make a request to the Snapd server; return the raw HTTPResponse object.""" + url = self.base_url + path + if query: + url = url + "?" + urllib.parse.urlencode(query) + + if headers is None: + headers = {} + request = urllib.request.Request(url, method=method, data=data, headers=headers) + + try: + response = self.opener.open(request, timeout=self.timeout) + except urllib.error.HTTPError as e: + code = e.code + status = e.reason + message = "" + try: + body = json.loads(e.read().decode())["result"] + except (IOError, ValueError, KeyError) as e2: + # Will only happen on read error or if Pebble sends invalid JSON. 
+ body = {} + message = "{} - {}".format(type(e2).__name__, e2) + raise SnapAPIError(body, code, status, message) + except urllib.error.URLError as e: + raise SnapAPIError({}, 500, "Not found", e.reason) + return response + + def get_installed_snaps(self) -> Dict: + """Get information about currently installed snaps.""" + return self._request("GET", "snaps") + + def get_snap_information(self, name: str) -> Dict: + """Query the snap server for information about single snap.""" + return self._request("GET", "find", {"name": name})[0] + + def get_installed_snap_apps(self, name: str) -> List: + """Query the snap server for apps belonging to a named, currently installed snap.""" + return self._request("GET", "apps", {"names": name, "select": "service"}) + + +class SnapCache(Mapping): + """An abstraction to represent installed/available packages. + + When instantiated, `SnapCache` iterates through the list of installed + snaps using the `snapd` HTTP API, and a list of available snaps by reading + the filesystem to populate the cache. Information about available snaps is lazily-loaded + from the `snapd` API when requested. 
+ """ + + def __init__(self): + if not self.snapd_installed: + raise SnapError("snapd is not installed or not in /usr/bin") from None + self._snap_client = SnapClient() + self._snap_map = {} + if self.snapd_installed: + self._load_available_snaps() + self._load_installed_snaps() + + def __contains__(self, key: str) -> bool: + """Check if a given snap is in the cache.""" + return key in self._snap_map + + def __len__(self) -> int: + """Report number of items in the snap cache.""" + return len(self._snap_map) + + def __iter__(self) -> Iterable["Snap"]: + """Provide iterator for the snap cache.""" + return iter(self._snap_map.values()) + + def __getitem__(self, snap_name: str) -> Snap: + """Return either the installed version or latest version for a given snap.""" + snap = self._snap_map.get(snap_name, None) + if snap is None: + # The snapd cache file may not have existed when _snap_map was + # populated. This is normal. + try: + self._snap_map[snap_name] = self._load_info(snap_name) + except SnapAPIError: + raise SnapNotFoundError("Snap '{}' not found!".format(snap_name)) + + return self._snap_map[snap_name] + + @property + def snapd_installed(self) -> bool: + """Check whether snapd has been installled on the system.""" + return os.path.isfile("/usr/bin/snap") + + def _load_available_snaps(self) -> None: + """Load the list of available snaps from disk. + + Leave them empty and lazily load later if asked for. + """ + if not os.path.isfile("/var/cache/snapd/names"): + # The snap catalog may not be populated yet; this is normal. + # snapd updates the cache infrequently and the cache file may not + # currently exist. 
+ return + + with open("/var/cache/snapd/names", "r") as f: + for line in f: + if line.strip(): + self._snap_map[line.strip()] = None + + def _load_installed_snaps(self) -> None: + """Load the installed snaps into the dict.""" + installed = self._snap_client.get_installed_snaps() + + for i in installed: + snap = Snap( + name=i["name"], + state=SnapState.Latest, + channel=i["channel"], + revision=i["revision"], + confinement=i["confinement"], + apps=i.get("apps", None), + ) + self._snap_map[snap.name] = snap + + def _load_info(self, name) -> Snap: + """Load info for snaps which are not installed if requested. + + Args: + name: a string representing the name of the snap + """ + info = self._snap_client.get_snap_information(name) + + return Snap( + name=info["name"], + state=SnapState.Available, + channel=info["channel"], + revision=info["revision"], + confinement=info["confinement"], + apps=None, + ) + + +@_cache_init +def add( + snap_names: Union[str, List[str]], + state: Union[str, SnapState] = SnapState.Latest, + channel: Optional[str] = "", + classic: Optional[bool] = False, + devmode: bool = False, + cohort: Optional[str] = "", + revision: Optional[str] = None, +) -> Union[Snap, List[Snap]]: + """Add a snap to the system. + + Args: + snap_names: the name or names of the snaps to install + state: a string or `SnapState` representation of the desired state, one of + [`Present` or `Latest`] + channel: an (Optional) channel as a string. Defaults to 'latest' + classic: an (Optional) boolean specifying whether it should be added with classic + confinement. Default `False` + devmode: an (Optional) boolean specifying whether it should be added with devmode + confinement. Default `False` + cohort: an (Optional) string specifying the snap cohort to use + revision: an (Optional) string specifying the snap revision to use + + Raises: + SnapError if some snaps failed to install or were not found. 
+ """ + if not channel and not revision: + channel = "latest" + + snap_names = [snap_names] if isinstance(snap_names, str) else snap_names + if not snap_names: + raise TypeError("Expected at least one snap to add, received zero!") + + if isinstance(state, str): + state = SnapState(state) + + return _wrap_snap_operations(snap_names, state, channel, classic, devmode, cohort, revision) + + +@_cache_init +def remove(snap_names: Union[str, List[str]]) -> Union[Snap, List[Snap]]: + """Remove specified snap(s) from the system. + + Args: + snap_names: the name or names of the snaps to install + + Raises: + SnapError if some snaps failed to install. + """ + snap_names = [snap_names] if isinstance(snap_names, str) else snap_names + if not snap_names: + raise TypeError("Expected at least one snap to add, received zero!") + return _wrap_snap_operations( + snap_names=snap_names, + state=SnapState.Absent, + channel="", + classic=False, + devmode=False, + ) + + +@_cache_init +def ensure( + snap_names: Union[str, List[str]], + state: str, + channel: Optional[str] = "", + classic: Optional[bool] = False, + devmode: bool = False, + cohort: Optional[str] = "", + revision: Optional[int] = None, +) -> Union[Snap, List[Snap]]: + """Ensure specified snaps are in a given state on the system. + + Args: + snap_names: the name(s) of the snaps to operate on + state: a string representation of the desired state, from `SnapState` + channel: an (Optional) channel as a string. Defaults to 'latest' + classic: an (Optional) boolean specifying whether it should be added with classic + confinement. Default `False` + devmode: an (Optional) boolean specifying whether it should be added with devmode + confinement. 
Default `False` + cohort: an (Optional) string specifying the snap cohort to use + revision: an (Optional) integer specifying the snap revision to use + + When both channel and revision are specified, the underlying snap install/refresh + command will determine the precedence (revision at the time of adding this) + + Raises: + SnapError if the snap is not in the cache. + """ + if not revision and not channel: + channel = "latest" + + if state in ("present", "latest") or revision: + return add( + snap_names=snap_names, + state=SnapState(state), + channel=channel, + classic=classic, + devmode=devmode, + cohort=cohort, + revision=revision, + ) + else: + return remove(snap_names) + + +def _wrap_snap_operations( + snap_names: List[str], + state: SnapState, + channel: str, + classic: bool, + devmode: bool, + cohort: Optional[str] = "", + revision: Optional[str] = None, +) -> Union[Snap, List[Snap]]: + """Wrap common operations for bare commands.""" + snaps = {"success": [], "failed": []} + + op = "remove" if state is SnapState.Absent else "install or refresh" + + for s in snap_names: + try: + snap = _Cache[s] + if state is SnapState.Absent: + snap.ensure(state=SnapState.Absent) + else: + snap.ensure( + state=state, + classic=classic, + devmode=devmode, + channel=channel, + cohort=cohort, + revision=revision, + ) + snaps["success"].append(snap) + except SnapError as e: + logger.warning("Failed to {} snap {}: {}!".format(op, s, e.message)) + snaps["failed"].append(s) + except SnapNotFoundError: + logger.warning("Snap '{}' not found in cache!".format(s)) + snaps["failed"].append(s) + + if len(snaps["failed"]): + raise SnapError( + "Failed to install or refresh snap(s): {}".format(", ".join(list(snaps["failed"]))) + ) + + return snaps["success"] if len(snaps["success"]) > 1 else snaps["success"][0] + + +def install_local( + filename: str, + classic: Optional[bool] = False, + devmode: Optional[bool] = False, + dangerous: Optional[bool] = False, +) -> Snap: + """Perform a snap 
operation. + + Args: + filename: the path to a local .snap file to install + classic: whether to use classic confinement + devmode: whether to use devmode confinement + dangerous: whether --dangerous should be passed to install snaps without a signature + + Raises: + SnapError if there is a problem encountered + """ + args = [ + "snap", + "install", + filename, + ] + if classic: + args.append("--classic") + if devmode: + args.append("--devmode") + if dangerous: + args.append("--dangerous") + try: + result = subprocess.check_output(args, universal_newlines=True).splitlines()[-1] + snap_name, _ = result.split(" ", 1) + snap_name = ansi_filter.sub("", snap_name) + + c = SnapCache() + + try: + return c[snap_name] + except SnapAPIError as e: + logger.error( + "Could not find snap {} when querying Snapd socket: {}".format(snap_name, e.body) + ) + raise SnapError("Failed to find snap {} in Snap cache".format(snap_name)) + except CalledProcessError as e: + raise SnapError("Could not install snap {}: {}".format(filename, e.output)) + + +def _system_set(config_item: str, value: str) -> None: + """Set system snapd config values. + + Args: + config_item: name of snap system setting. E.g. 'refresh.hold' + value: value to assign + """ + args = ["snap", "set", "system", "{}={}".format(config_item, value)] + try: + subprocess.check_call(args, universal_newlines=True) + except CalledProcessError: + raise SnapError("Failed setting system config '{}' to '{}'".format(config_item, value)) + + +def hold_refresh(days: int = 90, forever: bool = False) -> bool: + """Set the system-wide snap refresh hold. + + Args: + days: number of days to hold system refreshes for. Maximum 90. Set to zero to remove hold. + forever: if True, will set a hold forever. 
+ """ + if not isinstance(forever, bool): + raise TypeError("forever must be a bool") + if not isinstance(days, int): + raise TypeError("days must be an int") + if forever: + _system_set("refresh.hold", "forever") + logger.info("Set system-wide snap refresh hold to: forever") + elif days == 0: + _system_set("refresh.hold", "") + logger.info("Removed system-wide snap refresh hold") + else: + # Currently the snap daemon can only hold for a maximum of 90 days + if not 1 <= days <= 90: + raise ValueError("days must be between 1 and 90") + # Add the number of days to current time + target_date = datetime.now(timezone.utc).astimezone() + timedelta(days=days) + # Format for the correct datetime format + hold_date = target_date.strftime("%Y-%m-%dT%H:%M:%S%z") + # Python dumps the offset in format '+0100', we need '+01:00' + hold_date = "{0}:{1}".format(hold_date[:-2], hold_date[-2:]) + # Actually set the hold date + _system_set("refresh.hold", hold_date) + logger.info("Set system-wide snap refresh hold to: %s", hold_date) diff --git a/src/charm.py b/src/charm.py index c5d34b0..fb0f830 100755 --- a/src/charm.py +++ b/src/charm.py @@ -26,6 +26,7 @@ Slurmrestd, SlurmrestdAvailableEvent, ) +from charms.grafana_agent.v0.cos_agent import COSAgentProvider from ops import ( ActionEvent, ActiveStatus, @@ -69,6 +70,12 @@ def __init__(self, *args): self._slurmd = Slurmd(self, "slurmd") self._slurmdbd = Slurmdbd(self, "slurmdbd") self._slurmrestd = Slurmrestd(self, "slurmrestd") + self._grafana_agent = COSAgentProvider( + self, + metrics_endpoints=[{"path": "/metrics", "port": 9092}], + metrics_rules_dir="./src/alert_rules/prometheus", + recurse_rules_dirs=True, + ) event_handler_bindings = { self.on.install: self._on_install, @@ -112,6 +119,7 @@ def _on_install(self, event: InstallEvent) -> None: self._slurmctld_manager.start_slurmctld() self.unit.set_workload_version(self._slurmctld_manager.version()) + prometheus.install() self.slurm_installed = True else: self.unit.status = 
BlockedStatus("Only singleton slurmctld is supported.")
diff --git a/src/prometheus.py b/src/prometheus.py
new file mode 100644
index 0000000..41d061a
--- /dev/null
+++ b/src/prometheus.py
@@ -0,0 +1,46 @@
+import logging
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+import charms.operator_libs_linux.v1.systemd as systemd
+import charms.operator_libs_linux.v2.snap as snap
+
+logger = logging.getLogger()
+
+REPO = "github.com/rivosinc/prometheus-slurm-exporter"
+TAG = "v1.5.1"
+
+TEMPLATE_DIR = Path(os.path.dirname(os.path.abspath(__file__))) / "templates"
+
+SERVICE_NAME = "prometheus-slurm-exporter.service"
+
+ENDPOINT = "http://localhost:9092/metrics"
+
+
+def install():
+    try:
+        snap.add("go", classic=True, channel="1.22/stable")
+    except snap.SnapError as e:
+        logger.error(f"Failed to install go snap. Reason: {e.message}")
+        raise e
+
+    try:
+        env = {
+            **os.environ,
+            "GOBIN": "/usr/bin",
+            "GOPATH": "/root/go",
+            "GOCACHE": "/root/.cache/go-build",
+        }
+        subprocess.run(
+            ["go", "install", f"{REPO}@{TAG}"], check=True, env=env, capture_output=True
+        )
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to install prometheus exporter. Reason: {e.stderr}")
+        raise e
+
+    shutil.copyfile(TEMPLATE_DIR / SERVICE_NAME, f"/etc/systemd/system/{SERVICE_NAME}")
+
+    systemd.daemon_reload()
+    systemd.service_enable(SERVICE_NAME, "--now")
diff --git a/src/templates/prometheus-slurm-exporter.service b/src/templates/prometheus-slurm-exporter.service
new file mode 100644
index 0000000..271a099
--- /dev/null
+++ b/src/templates/prometheus-slurm-exporter.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Service for prometheus-slurm-exporter
+Wants=network.target
+After=network.target slurmctld.service
+
+[Service]
+ExecStart=/usr/bin/prometheus-slurm-exporter
+SyslogIdentifier=prometheus-slurm-exporter
+Restart=always
+RestartSec=15
+TimeoutStopSec=30
+Type=simple
+
+[Install]
+WantedBy=multi-user.target