From 674fe99203395ed4629d8c90bd50dbb437f5d968 Mon Sep 17 00:00:00 2001 From: Ryan Barry Date: Wed, 20 Jul 2022 10:47:59 -0400 Subject: [PATCH] Forward Loki alert rules as well as Prometheus alert rules (#54) * Stop mypy from going crazy on our tests here, too * Update libs * [DRAFT] dynamic method binding * Forward Loki alerts also * Check Prometheus alert forwarding in integration tests too * Re-add self-monitoring after merge * Clean up merge, make mypy quiet * No static checking of tests * Trust prometheus * Get the right number of rules with self-monitoring present --- .github/workflows/release-edge.yaml | 8 +- .gitignore | 1 + .../observability_libs/v0/juju_topology.py | 306 ++++++++++++++++++ pyproject.toml | 6 +- src/charm.py | 87 +++-- .../grafana_agent_high_rate.rule | 10 + tests/integration/conftest.py | 82 ++++- tests/integration/helpers.py | 177 +++++++++- tests/integration/loki-tester/.flake8 | 9 + tests/integration/loki-tester/.gitignore | 7 + tests/integration/loki-tester/.jujuignore | 3 + tests/integration/loki-tester/README.md | 46 +++ tests/integration/loki-tester/actions.yaml | 7 + tests/integration/loki-tester/charmcraft.yaml | 10 + tests/integration/loki-tester/metadata.yaml | 17 + .../loki-tester/requirements-dev.txt | 1 + .../integration/loki-tester/requirements.txt | 2 + tests/integration/loki-tester/src/charm.py | 174 ++++++++++ .../src/loki_alert_rules/alert_on_error.rule | 7 + .../free-standing/alerting.rule | 23 ++ .../integration/prometheus-tester/.jujuignore | 4 + .../prometheus-tester/charmcraft.yaml | 12 + .../integration/prometheus-tester/config.yaml | 12 + .../prometheus-tester/metadata.yaml | 21 ++ .../prometheus-tester/requirements.txt | 1 + .../prometheus-tester/src/charm.py | 129 ++++++++ .../prometheus-tester/src/metrics.py | 33 ++ .../prometheus_alert_rules/cpu_overuse.rule | 8 + .../with_extra_alert_rule/cpu_overuse.rule | 8 + .../with_extra_alert_rule/target_missing.rule | 8 + tests/integration/prometheus.py | 148 ++++++++- tests/integration/test_charm.py | 24 +- tests/integration/test_forwards_alerts.py | 87 +++++ tests/integration/test_kubectl_delete.py | 6 +- tests/integration/test_upgrade_charm.py | 8 +- tests/unit/test_alerts.py | 248 ++++++++++++++ ..._charm.py => test_scrape_configuration.py} | 11 +- tox.ini | 10 +- 38 files changed, 1695 insertions(+), 66 deletions(-) create mode 100644 lib/charms/observability_libs/v0/juju_topology.py create mode 100644 src/loki_alert_rules/grafana_agent_high_rate.rule create mode 100644 tests/integration/loki-tester/.flake8 create mode 100644 tests/integration/loki-tester/.gitignore create mode 100644 tests/integration/loki-tester/.jujuignore create mode 100644 tests/integration/loki-tester/README.md create mode 100644 tests/integration/loki-tester/actions.yaml create mode 100644 tests/integration/loki-tester/charmcraft.yaml create mode 100644 tests/integration/loki-tester/metadata.yaml create mode 100644 tests/integration/loki-tester/requirements-dev.txt create mode 100644 tests/integration/loki-tester/requirements.txt create mode 100755 tests/integration/loki-tester/src/charm.py create mode 100644 tests/integration/loki-tester/src/loki_alert_rules/alert_on_error.rule create mode 100644 tests/integration/loki-tester/src/loki_alert_rules/free-standing/alerting.rule create mode 100644 tests/integration/prometheus-tester/.jujuignore create mode 100644 tests/integration/prometheus-tester/charmcraft.yaml create mode 100644 tests/integration/prometheus-tester/config.yaml create mode 100644 
tests/integration/prometheus-tester/metadata.yaml create mode 100644 tests/integration/prometheus-tester/requirements.txt create mode 100755 tests/integration/prometheus-tester/src/charm.py create mode 100644 tests/integration/prometheus-tester/src/metrics.py create mode 100644 tests/integration/prometheus-tester/src/prometheus_alert_rules/cpu_overuse.rule create mode 100644 tests/integration/prometheus-tester/src/with_extra_alert_rule/cpu_overuse.rule create mode 100644 tests/integration/prometheus-tester/src/with_extra_alert_rule/target_missing.rule create mode 100755 tests/integration/test_forwards_alerts.py create mode 100644 tests/unit/test_alerts.py rename tests/unit/{test_charm.py => test_scrape_configuration.py} (94%) diff --git a/.github/workflows/release-edge.yaml b/.github/workflows/release-edge.yaml index 25a78688..a46bc124 100644 --- a/.github/workflows/release-edge.yaml +++ b/.github/workflows/release-edge.yaml @@ -36,7 +36,7 @@ jobs: run: python3 -m pip install tox - name: Run static analysis for /lib for 3.5 run: tox -vve static-lib - + static-charm: name: Static analysis of the charm and tests runs-on: ubuntu-latest @@ -49,11 +49,7 @@ jobs: run: python3 -m pip install tox - name: Run static analysis (charm) run: tox -vve static-charm - - name: Run static analysis (unit tests) - run: tox -vve static-unit - - name: Run static analysis (integration tests) - run: tox -vve static-integration - + lint: name: Lint runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 6778fb32..4746d70c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__/ *.py[cod] .tox .idea/ +tests/integration/*-tester/lib/ diff --git a/lib/charms/observability_libs/v0/juju_topology.py b/lib/charms/observability_libs/v0/juju_topology.py new file mode 100644 index 00000000..c985b1e7 --- /dev/null +++ b/lib/charms/observability_libs/v0/juju_topology.py @@ -0,0 +1,306 @@ +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. +"""## Overview. + +This document explains how to use the `JujuTopology` class to +create and consume topology information from Juju in a consistent manner. + +The goal of the Juju topology is to uniquely identify a piece +of software running across any of your Juju-managed deployments. +This is achieved by combining the following four elements: + +- Model name +- Model UUID +- Application name +- Unit identifier + + +For a more in-depth description of the concept, as well as a +walk-through of its use case in observability, see +[this blog post](https://juju.is/blog/model-driven-observability-part-2-juju-topology-metrics) +on the Juju blog. + +## Library Usage + +This library may be used to create and consume `JujuTopology` objects. +The `JujuTopology` class provides three ways to create instances: + +### Using the `from_charm` method + +Enables instantiation by supplying the charm as an argument. When +creating topology objects for the current charm, this is the recommended +approach. + +```python +topology = JujuTopology.from_charm(self) +``` + +### Using the `from_dict` method + +Allows for instantiation using a dictionary of relation data, like the +`scrape_metadata` from Prometheus or the labels of an alert rule. When +creating topology objects for remote charms, this is the recommended +approach. 
+ +```python +scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) +topology = JujuTopology.from_dict(scrape_metadata) +``` + +### Using the class constructor + +Enables instantiation using whatever values you want. While this +is useful in some very specific cases, this is almost certainly not +what you are looking for, as setting these values manually may +result in observability metrics which do not uniquely identify a +charm, undermining accurate usage reporting, alerting, +horizontal scaling, and other use cases. + +```python +topology = JujuTopology( + model="some-juju-model", + model_uuid="00000000-0000-0000-0000-000000000001", + application="fancy-juju-application", + unit="fancy-juju-application/0", + charm_name="fancy-juju-application-k8s", +) +``` + +""" + +import re +from collections import OrderedDict +from typing import Dict, List, Optional + +# The unique Charmhub library identifier, never change it +LIBID = "bced1658f20f49d28b88f61f83c2d232" + +LIBAPI = 0 +LIBPATCH = 2 + + +class InvalidUUIDError(Exception): + """Invalid UUID was provided.""" + + def __init__(self, uuid: str): + self.message = "'{}' is not a valid UUID.".format(uuid) + super().__init__(self.message) + + +class JujuTopology: + """JujuTopology is used for storing, generating and formatting juju topology information.""" + + def __init__( + self, + model: str, + model_uuid: str, + application: str, + unit: str = None, + charm_name: str = None, + ): + """Build a JujuTopology object. + + A `JujuTopology` object is used for storing and transforming + Juju topology information. This information is used to + annotate Prometheus scrape jobs and alert rules. Such + annotation, when applied to scrape jobs, helps in identifying + the source of the scraped metrics. When + applied to alert rules, topology information ensures that + evaluation of alert expressions is restricted to the source + (charm) from which the alert rules were obtained. 
+ + Args: + model: a string name of the Juju model + model_uuid: a globally unique string identifier for the Juju model + application: an application name as a string + unit: a unit name as a string + charm_name: name of charm as a string + """ + if not self.is_valid_uuid(model_uuid): + raise InvalidUUIDError(model_uuid) + + self._model = model + self._model_uuid = model_uuid + self._application = application + self._charm_name = charm_name + self._unit = unit + + def is_valid_uuid(self, uuid): + """Validate the supplied UUID against the Juju Model UUID pattern.""" + # TODO: + # Harness is hardcoding a UUID that is v1, not v4: f2c1b2a6-e006-11eb-ba80-0242ac130004 + # See: https://github.com/canonical/operator/issues/779 + # + # >>> uuid.UUID("f2c1b2a6-e006-11eb-ba80-0242ac130004").version + # 1 + # + # We changed the validation of the 3rd UUID block: 4[a-f0-9]{3} -> [a-f0-9]{4} + # See: https://github.com/canonical/operator/blob/main/ops/testing.py#L1094 + # + # Juju in fact generates a UUID v4: https://github.com/juju/utils/blob/master/uuid.go#L62 + # but does not validate it is actually v4: + # See: + # - https://github.com/juju/utils/blob/master/uuid.go#L22 + # - https://github.com/juju/schema/blob/master/strings.go#L79 + # + # Once Harness fixes this, we should remove this comment and refactor the regex or + # the entire method using the uuid module to validate UUIDs + regex = re.compile( + "^[a-f0-9]{8}-?[a-f0-9]{4}-?[a-f0-9]{4}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}$" + ) + return bool(regex.match(uuid)) + + @classmethod + def from_charm(cls, charm): + """Create a JujuTopology instance by using the model data available on a charm object. + + Args: + charm: a `CharmBase` object for which the `JujuTopology` will be constructed + Returns: + a `JujuTopology` object. + """ + return cls( + model=charm.model.name, + model_uuid=charm.model.uuid, + application=charm.model.app.name, + unit=charm.model.unit.name, + charm_name=charm.meta.name, + ) + + @classmethod + def from_dict(cls, data: dict): + """Factory method for creating `JujuTopology` children from a dictionary. + + Args: + data: a dictionary with five keys providing topology information. The keys are + - "model" + - "model_uuid" + - "application" + - "unit" + - "charm_name" + `unit` and `charm_name` may be empty, but will result in more limited + labels. However, this allows us to support charms without workloads. + + Returns: + a `JujuTopology` object. + """ + return cls( + model=data["model"], + model_uuid=data["model_uuid"], + application=data["application"], + unit=data.get("unit", ""), + charm_name=data.get("charm_name", ""), + ) + + def as_dict( + self, *, remapped_keys: Dict[str, str] = None, excluded_keys: List[str] = None + ) -> OrderedDict: + """Format the topology information into an ordered dict. + + Keeping the dictionary ordered is important to be able to + compare dicts without having to resort to deep comparisons. + + Args: + remapped_keys: A dictionary mapping old key names to new key names, + which will be substituted when invoked. + excluded_keys: A list of key names to exclude from the returned dict. 
+ """ + ret = OrderedDict( + [ + ("model", self.model), + ("model_uuid", self.model_uuid), + ("application", self.application), + ("unit", self.unit), + ("charm_name", self.charm_name), + ] + ) + if excluded_keys: + ret = OrderedDict({k: v for k, v in ret.items() if k not in excluded_keys}) + + if remapped_keys: + ret = OrderedDict( + (remapped_keys.get(k), v) if remapped_keys.get(k) else (k, v) for k, v in ret.items() # type: ignore + ) + + return ret + + @property + def identifier(self) -> str: + """Format the topology information into a terse string. + + This crops the model UUID, making it unsuitable for comparisons against + anything but other identifiers. Mainly to be used as a display name or file + name where long strings might become an issue. + + >>> JujuTopology( \ + model = "a-model", \ + model_uuid = "00000000-0000-4000-8000-000000000000", \ + application = "some-app", \ + unit = "some-app/1" \ + ).identifier + 'a-model_00000000_some-app' + """ + parts = self.as_dict( + excluded_keys=["unit", "charm_name"], + ) + + parts["model_uuid"] = self.model_uuid_short + values = parts.values() + + return "_".join([str(val) for val in values]).replace("/", "_") + + @property + def label_matcher_dict(self) -> Dict[str, str]: + """Format the topology information into a dict with keys having 'juju_' as prefix. + + Relabelled topology never includes the unit as it would then only match + the leader unit (ie. the unit that produced the dict). + """ + items = self.as_dict( + remapped_keys={"charm_name": "charm"}, + excluded_keys=["unit"], + ).items() + + return {"juju_{}".format(key): value for key, value in items if value} + + @property + def label_matchers(self) -> str: + """Format the topology information into a promql/logql label matcher string. + + Topology label matchers should never include the unit as it + would then only match the leader unit (ie. the unit that + produced the matchers). 
+ """ + items = self.label_matcher_dict.items() + return ", ".join(['{}="{}"'.format(key, value) for key, value in items if value]) + + @property + def model(self) -> str: + """Getter for the juju model value.""" + return self._model + + @property + def model_uuid(self) -> str: + """Getter for the juju model uuid value.""" + return self._model_uuid + + @property + def model_uuid_short(self) -> str: + """Getter for the juju model value, truncated to the first eight letters.""" + return self._model_uuid[:8] + + @property + def application(self) -> str: + """Getter for the juju application value.""" + return self._application + + @property + def charm_name(self) -> Optional[str]: + """Getter for the juju charm name value.""" + return self._charm_name + + @property + def unit(self) -> Optional[str]: + """Getter for the juju unit value.""" + return self._unit diff --git a/pyproject.toml b/pyproject.toml index 815bd296..8375910c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ select = ["E", "W", "F", "C", "N", "R", "D", "H"] # Ignore D107 Missing docstring in __init__ ignore = ["W503", "E501", "D107"] # D100, D101, D102, D103: Ignore missing docstrings in tests -per-file-ignores = ["tests/*:D100,D101,D102,D103"] +per-file-ignores = ["tests/*:D100,D101,D102,D103,C901", "lib/*:C901"] docstring-convention = "google" # Check for properly formatted copyright header in each file copyright-check = "True" @@ -38,7 +38,7 @@ copyright-regexp = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+%(author)s" [tool.mypy] pretty = true python_version = 3.8 -mypy_path = "$MYPY_CONFIG_FILE_DIR/src:$MYPY_CONFIG_FILE_DIR/lib:$MYPY_CONFIG_FILE_DIR/tests/unit" +mypy_path = "./src:./lib" follow_imports = "normal" warn_redundant_casts = true warn_unused_ignores = true @@ -63,4 +63,4 @@ follow_imports = "silent" [tool.pytest.ini_options] minversion = "6.0" log_cli_level = "INFO" -asyncio_mode = "auto" \ No newline at end of file +asyncio_mode = "auto" diff --git a/src/charm.py b/src/charm.py index 92b71d3f..d6d51e72 100755 --- a/src/charm.py +++ b/src/charm.py @@ -4,12 +4,12 @@ # See LICENSE file for licensing details. 
"""A juju charm for Grafana Agent on Kubernetes.""" - import logging import os import pathlib import shutil -from typing import Any, Dict +from collections import namedtuple +from typing import Any, Callable, Dict import yaml from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider @@ -34,11 +34,15 @@ logger = logging.getLogger(__name__) CONFIG_PATH = "/etc/agent/agent.yaml" +LOKI_RULES_SRC_PATH = "./src/loki_alert_rules" +LOKI_RULES_DEST_PATH = "./loki_alert_rules" METRICS_RULES_SRC_PATH = "./src/prometheus_alert_rules" METRICS_RULES_DEST_PATH = "./prometheus_alert_rules" REMOTE_WRITE_RELATION_NAME = "send-remote-write" SCRAPE_RELATION_NAME = "metrics-endpoint" +RulesMapping = namedtuple("RulesMapping", ["src", "dest"]) + class GrafanaAgentReloadError(Exception): """Custom exception to indicate that grafana agent config couldn't be reloaded.""" @@ -59,8 +63,16 @@ class GrafanaAgentOperatorCharm(CharmBase): def __init__(self, *args): super().__init__(*args) self._container = self.unit.get_container(self._name) - self._metrics_rules_src_path = os.path.join(self.charm_dir, METRICS_RULES_SRC_PATH) - self._metrics_rules_dest_path = os.path.join(self.charm_dir, METRICS_RULES_DEST_PATH) + + self.loki_rules_paths = RulesMapping( + src=os.path.join(self.charm_dir, LOKI_RULES_SRC_PATH), + dest=os.path.join(self.charm_dir, LOKI_RULES_DEST_PATH), + ) + self.metrics_rules_paths = RulesMapping( + src=os.path.join(self.charm_dir, METRICS_RULES_SRC_PATH), + dest=os.path.join(self.charm_dir, METRICS_RULES_DEST_PATH), + ) + self.service_patch = KubernetesServicePatch( self, [ @@ -69,10 +81,9 @@ def __init__(self, *args): ], ) - if not os.path.isdir(self._metrics_rules_dest_path): - shutil.copytree( - self._metrics_rules_src_path, self._metrics_rules_dest_path, dirs_exist_ok=True - ) + for rules in [self.loki_rules_paths, self.metrics_rules_paths]: + if not os.path.isdir(rules.dest): + shutil.copytree(rules.src, rules.dest, dirs_exist_ok=True) # Self-monitoring self._scraping = MetricsEndpointProvider( @@ -83,26 +94,34 @@ def __init__(self, *args): self._grafana_dashboards = GrafanaDashboardProvider( self, relation_name="grafana-dashboard" ) + self._remote_write = PrometheusRemoteWriteConsumer( - self, alert_rules_path=self._metrics_rules_dest_path + self, alert_rules_path=self.metrics_rules_paths.dest ) self._scrape = MetricsEndpointConsumer(self) - self._loki_consumer = LokiPushApiConsumer(self, relation_name="logging-consumer") + self._loki_consumer = LokiPushApiConsumer( + self, relation_name="logging-consumer", alert_rules_path=self.loki_rules_paths.dest + ) self._loki_provider = LokiPushApiProvider( self, relation_name="logging-provider", port=self._http_listen_port ) self.framework.observe(self.on.agent_pebble_ready, self.on_pebble_ready) - self.framework.observe(self.on.upgrade_charm, self.update_alerts_rules) + self.framework.observe(self.on.upgrade_charm, self._metrics_alerts) + self.framework.observe(self.on.upgrade_charm, self._loki_alerts) self.framework.observe( self._remote_write.on.endpoints_changed, self.on_remote_write_changed ) - self.framework.observe(self._remote_write.on.endpoints_changed, self.update_alerts_rules) + self.framework.observe(self._remote_write.on.endpoints_changed, self._metrics_alerts) self.framework.observe(self._scrape.on.targets_changed, self.on_scrape_targets_changed) - self.framework.observe(self._scrape.on.targets_changed, self.update_alerts_rules) + self.framework.observe(self._scrape.on.targets_changed, self._metrics_alerts) + + 
self.framework.observe( + self._loki_provider.on.loki_push_api_alert_rules_changed, self._loki_alerts + ) self.framework.observe( self._loki_consumer.on.loki_push_api_endpoint_joined, @@ -113,18 +132,42 @@ def __init__(self, *args): self._on_loki_push_api_endpoint_departed, ) - def update_alerts_rules(self, _): + def _metrics_alerts(self, event): + self.update_alerts_rules( + event, + alerts_func=self._scrape.alerts, + reload_func=self._remote_write.reload_alerts, + mapping=self.metrics_rules_paths, + ) + + def _loki_alerts(self, event): + self.update_alerts_rules( + event, + alerts_func=self._loki_provider.alerts, + reload_func=self._loki_consumer._reinitialize_alert_rules, + mapping=self.loki_rules_paths, + ) + + def update_alerts_rules( + self, _, alerts_func: Any, reload_func: Callable, mapping: RulesMapping + ): """Copy alert rules from relations and save them to disk.""" - rules = self._scrape.alerts() - shutil.rmtree(self._metrics_rules_dest_path) - shutil.copytree(self._metrics_rules_src_path, self._metrics_rules_dest_path) + rules = {} + + # MetricsEndpointConsumer.alerts is a method, while LokiPushApiProvider.alerts + # is a property, so handle either form + if callable(alerts_func): + rules = alerts_func() + else: + rules = alerts_func + + shutil.rmtree(mapping.dest) + shutil.copytree(mapping.src, mapping.dest) for topology_identifier, rule in rules.items(): - file_handle = pathlib.Path( - self._metrics_rules_dest_path, "juju_{}.rules".format(topology_identifier) - ) + file_handle = pathlib.Path(mapping.dest, "juju_{}.rules".format(topology_identifier)) file_handle.write_text(yaml.dump(rule)) logger.debug("updated alert rules file {}".format(file_handle.absolute())) - self._remote_write.reload_alerts() + reload_func() def _on_loki_push_api_endpoint_joined(self, event) -> None: """Event handler for the logging relation changed event.""" @@ -182,7 +225,7 @@ def _update_status(self) -> None: self.unit.status = ActiveStatus() - def _update_config(self, event=None): + def _update_config(self, _) -> None: if not self._container.can_connect(): # Pebble is not ready yet so no need to update config self.unit.status = WaitingStatus("waiting for agent container to start") diff --git a/src/loki_alert_rules/grafana_agent_high_rate.rule b/src/loki_alert_rules/grafana_agent_high_rate.rule new file mode 100644 index 00000000..d73893c3 --- /dev/null +++ b/src/loki_alert_rules/grafana_agent_high_rate.rule @@ -0,0 +1,10 @@ +groups: + - name: grafana-agent-high-log-volume + rules: + - alert: HighLogVolume + expr: | + count_over_time(({%%juju_topology%%})[30s]) > 100 + labels: + severity: high + annotations: + summary: Log rate is too high! diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index eb1bdf6f..6fbf9392 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,12 +1,90 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. 
+import functools +import logging +import os +import shutil +from collections import defaultdict +from datetime import datetime import pytest from pytest_operator.plugin import OpsTest +logger = logging.getLogger(__name__) + + +class Store(defaultdict): + def __init__(self): + super(Store, self).__init__(Store) + + def __getattr__(self, key): + """Override __getattr__ so dot syntax works on keys.""" + try: + return self[key] + except KeyError: + raise AttributeError(key) + + def __setattr__(self, key, value): + """Override __setattr__ so dot syntax works on keys.""" + self[key] = value + + +store = Store() + + +def timed_memoizer(func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + fname = func.__qualname__ + logger.info("Started: %s" % fname) + start_time = datetime.now() + if fname in store.keys(): + ret = store[fname] + else: + logger.info("Return value for {} not cached".format(fname)) + ret = await func(*args, **kwargs) + store[fname] = ret + logger.info("Finished: {} in: {} seconds".format(fname, datetime.now() - start_time)) + return ret + + return wrapper + + +@pytest.fixture(scope="module", autouse=True) +def copy_libraries_into_test_charm(): + """Ensure that the tester charms use the current charm libraries.""" + testers = ["loki-tester", "prometheus-tester"] + for t in testers: + if os.path.exists(f"tests/integration/{t}/lib"): + shutil.rmtree(f"tests/integration/{t}/lib") + shutil.copytree("lib", f"tests/integration/{t}/lib") + @pytest.fixture(scope="module") -async def charm_under_test(ops_test: OpsTest): - """Charm used for integration testing.""" +@timed_memoizer +async def grafana_agent_charm(ops_test: OpsTest): + """Grafana Agent charm used for integration testing.""" charm = await ops_test.build_charm(".") return charm + + +@pytest.fixture(scope="module") +@timed_memoizer +async def loki_tester_charm(ops_test): + """A charm for integration testing of the Loki charm.""" + charm_path = "tests/integration/loki-tester" + clean_cmd = ["charmcraft", "clean", "-p", charm_path] + await ops_test.run(*clean_cmd) + charm = await ops_test.build_charm(charm_path) + return charm + + +@pytest.fixture(scope="module") +@timed_memoizer +async def prometheus_tester_charm(ops_test): + """A charm for integration testing of the Prometheus charm.""" + charm_path = "tests/integration/prometheus-tester" + clean_cmd = ["charmcraft", "clean", "-p", charm_path] + await ops_test.run(*clean_cmd) + charm = await ops_test.build_charm(charm_path) + return charm diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index debb7cb4..6d228354 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1,16 +1,123 @@ -#!/usr/bin/env python3 -# Copyright 2022 Canonical Ltd. +# Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. 
+import asyncio +import json import logging +import urllib.error +import urllib.request +from pathlib import Path from typing import List +import yaml from asyncstdlib import functools from grafana import Grafana from prometheus import Prometheus from pytest_operator.plugin import OpsTest -log = logging.getLogger(__name__) +logger = logging.getLogger(__name__) + + +async def get_unit_address(ops_test, app_name: str, unit_num: int) -> str: + status = await ops_test.model.get_status() # noqa: F821 + return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] + + +async def is_loki_up(ops_test, app_name, num_units=1) -> bool: + # Sometimes get_unit_address returns None for reasons unknown, so loop until it doesn't + addresses = [""] * num_units + while not all(addresses): + addresses = [await get_unit_address(ops_test, app_name, i) for i in range(num_units)] + + def get(url) -> bool: + response = urllib.request.urlopen(url, data=None, timeout=2.0) + return response.code == 200 and "version" in json.loads(response.read()) + + return all(get(f"http://{address}:3100/loki/api/v1/status/buildinfo") for address in addresses) + + +async def loki_rules(ops_test, app_name) -> dict: + address = await get_unit_address(ops_test, app_name, 0) + url = f"http://{address}:3100" + + try: + response = urllib.request.urlopen(f"{url}/loki/api/v1/rules", data=None, timeout=2.0) + if response.code == 200: + return yaml.safe_load(response.read()) + return {} + except urllib.error.HTTPError: + return {} + + +async def loki_alerts(ops_test: OpsTest, app_name: str, unit_num: int = 0, retries: int = 3) -> list: + r"""Get a list of alerts from a Prometheus-compatible endpoint. + + Results look like: + { + "data": { + "groups": [ + { + "rules": [ + { + "alerts": [ + { + "activeAt": "2018-07-04T20:27:12.60602144+02:00", + "annotations": { + "summary": "High request latency" + }, + "labels": { + "alertname": "HighRequestLatency", + "severity": "page" + }, + "state": "firing", + "value": "1e+00" + } + ], + "annotations": { + "summary": "High request latency" + }, + "duration": 600, + "health": "ok", + "labels": { + "severity": "page" + }, + "name": "HighRequestLatency", + "query": "job:request_latency_seconds:mean5m{job=\"myjob\"} > 0.5", + "type": "alerting" + }, + { + "health": "ok", + "name": "job:http_inprogress_requests:sum", + "query": "sum by (job) (http_inprogress_requests)", + "type": "recording" + } + ], + "file": "/rules.yaml", + "interval": 60, + "limit": 0, + "name": "example" + } + ] + }, + "status": "success" + } + """ + address = await get_unit_address(ops_test, app_name, unit_num) + url = f"http://{address}:3100/prometheus/api/v1/alerts" + + # Retry since the endpoint may not _immediately_ return valid data + while not ( + alerts := json.loads(urllib.request.urlopen(url, data=None, timeout=2).read())["data"][ + "alerts" + ] + ): + retries -= 1 + if retries > 0: + await asyncio.sleep(2) + else: + break + + return alerts async def unit_address(ops_test: OpsTest, app_name: str, unit_num: int) -> str: @@ -28,6 +135,70 @@ async def unit_address(ops_test: OpsTest, app_name: str, unit_num: int) -> str: return status["applications"][app_name]["units"][f"{app_name}/{unit_num}"]["address"] +async def prometheus_rules(ops_test: OpsTest, app_name: str, unit_num: int) -> list: + """Fetch all Prometheus rules. + + Args: + ops_test: pytest-operator plugin + app_name: string name of Prometheus application + unit_num: integer number of a Prometheus juju unit + + Returns: + a list of rule groups. 
+ """ + host = await unit_address(ops_test, app_name, unit_num) + prometheus = Prometheus(host=host) + rules = await prometheus.rules() + return rules + + +def oci_image(metadata_file: str, image_name: str) -> str: + """Find upstream source for a container image. + + Args: + metadata_file: string path of metadata YAML file relative + to top level charm directory + image_name: OCI container image string name as defined in + metadata.yaml file + Returns: + upstream image source + Raises: + FileNotFoundError: if metadata_file path is invalid + ValueError: if upstream source for image name can not be found + """ + metadata = yaml.safe_load(Path(metadata_file).read_text()) + + resources = metadata.get("resources", {}) + if not resources: + raise ValueError("No resources found") + + image = resources.get(image_name, {}) + if not image: + raise ValueError("{} image not found".format(image_name)) + + upstream_source = image.get("upstream-source", "") + if not upstream_source: + raise ValueError("Upstream source not found") + + return upstream_source + + +def initial_workload_is_ready(ops_test, app_names) -> bool: + """Checks that the initial workload (ie. x/0) is ready. + + Args: + ops_test: pytest-operator plugin + app_names: array of application names to check for + + Returns: + whether the workloads are active or not + """ + return all( + ops_test.model.applications[name].units[0].workload_status == "active" + for name in app_names + ) + + @functools.cache async def unit_password(ops_test: OpsTest, app_name: str, unit_num: int) -> str: """Get the admin password for a unit. Memoize it to reduce turnaround time. diff --git a/tests/integration/loki-tester/.flake8 b/tests/integration/loki-tester/.flake8 new file mode 100644 index 00000000..8ef84fcd --- /dev/null +++ b/tests/integration/loki-tester/.flake8 @@ -0,0 +1,9 @@ +[flake8] +max-line-length = 99 +select: E,W,F,C,N +exclude: + venv + .git + build + dist + *.egg_info diff --git a/tests/integration/loki-tester/.gitignore b/tests/integration/loki-tester/.gitignore new file mode 100644 index 00000000..2c3f0e5e --- /dev/null +++ b/tests/integration/loki-tester/.gitignore @@ -0,0 +1,7 @@ +venv/ +build/ +*.charm + +.coverage +__pycache__/ +*.py[cod] diff --git a/tests/integration/loki-tester/.jujuignore b/tests/integration/loki-tester/.jujuignore new file mode 100644 index 00000000..6ccd559e --- /dev/null +++ b/tests/integration/loki-tester/.jujuignore @@ -0,0 +1,3 @@ +/venv +*.py[cod] +*.charm diff --git a/tests/integration/loki-tester/README.md b/tests/integration/loki-tester/README.md new file mode 100644 index 00000000..2aa054a7 --- /dev/null +++ b/tests/integration/loki-tester/README.md @@ -0,0 +1,46 @@ +# loki-tester + +## Description + +This charm generates synthetic logs that may be used by the Loki +Operator and used for the purposes of integration testing the +Loki operator. The synthetic data is actually this charm's own +Python debug logs. + +## Usage + +Build the Loki Tester charm using `charmcraft pack` in the +`tests/integration/loki-tester` directory. + +Deploy the Loki charm and Loki Tester charm and add a relation +between them. + +``` +juju deploy loki-k8s --channel=beta +juju deploy ./loki-tester_ubuntu-20.04-amd64.charm +juju relate loki-k8s loki-tester +``` + +Query logs sent by Loki tester to Loki +``` +curl -G -s http://$(lokiaddr):3100/loki/api/v1/query_range --data-urlencode "query={logger=\"Loki-Tester\"}" +``` +Note `$(lokiaddr)` is the IP address of the deployed Loki application. 
+ +Query the alert rules sent by the Loki tester to Loki +``` +curl -G -s http://$(lokiaddr):3100/prometheus/api/v1/rules +``` + +Make Loki tester send an error log +``` +juju run-action loki-tester/0 log-error message="some error message" +``` + +Check if the `log-error` action is triggering an alert by querying +the alerts raised by Loki. +``` +curl -G -s http://$(lokiaddr):3100/prometheus/api/v1/alerts +``` +You may need to run this a couple of times before you see the alert because +there is a time lag between running the action and the alert triggering. diff --git a/tests/integration/loki-tester/actions.yaml b/tests/integration/loki-tester/actions.yaml new file mode 100644 index 00000000..096be650 --- /dev/null +++ b/tests/integration/loki-tester/actions.yaml @@ -0,0 +1,7 @@ +log-error: + description: Log an error message. + params: + message: + description: Error message to be logged. + type: string + default: "" diff --git a/tests/integration/loki-tester/charmcraft.yaml b/tests/integration/loki-tester/charmcraft.yaml new file mode 100644 index 00000000..048d4544 --- /dev/null +++ b/tests/integration/loki-tester/charmcraft.yaml @@ -0,0 +1,10 @@ +# Learn more about charmcraft.yaml configuration at: +# https://juju.is/docs/sdk/charmcraft-config +type: "charm" +bases: + - build-on: + - name: "ubuntu" + channel: "20.04" + run-on: + - name: "ubuntu" + channel: "20.04" diff --git a/tests/integration/loki-tester/metadata.yaml b/tests/integration/loki-tester/metadata.yaml new file mode 100644 index 00000000..53659630 --- /dev/null +++ b/tests/integration/loki-tester/metadata.yaml @@ -0,0 +1,17 @@ +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +# For a complete list of supported options, see: +# https://juju.is/docs/sdk/metadata-reference +name: loki-tester +display-name: loki-tester +description: | + An integration tester for the Loki Operator. +summary: | + This charm exercises the functionality offered by the Loki Operator + in order to integration-test it. +platforms: + - kubernetes +requires: + logging: + interface: loki_push_api diff --git a/tests/integration/loki-tester/requirements-dev.txt b/tests/integration/loki-tester/requirements-dev.txt new file mode 100644 index 00000000..bc04b496 --- /dev/null +++ b/tests/integration/loki-tester/requirements-dev.txt @@ -0,0 +1 @@ +-r requirements.txt diff --git a/tests/integration/loki-tester/requirements.txt b/tests/integration/loki-tester/requirements.txt new file mode 100644 index 00000000..346bcdfb --- /dev/null +++ b/tests/integration/loki-tester/requirements.txt @@ -0,0 +1,2 @@ +ops >= 1.2.0 +python-logging-loki diff --git a/tests/integration/loki-tester/src/charm.py b/tests/integration/loki-tester/src/charm.py new file mode 100755 index 00000000..a17c99bb --- /dev/null +++ b/tests/integration/loki-tester/src/charm.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""A Integration tester charm for Loki Operator.""" + +import logging +from multiprocessing import Queue + +import logging_loki # type: ignore +from charms.loki_k8s.v0.loki_push_api import LokiPushApiConsumer +from charms.observability_libs.v0.juju_topology import JujuTopology +from ops.charm import CharmBase +from ops.main import main +from ops.model import ActiveStatus + + +class LokiTesterCharm(CharmBase): + """A Loki Operator Client Charm.""" + + def __init__(self, *args): + super().__init__(*args) + + self._loki_consumer = LokiPushApiConsumer(self) + + self.framework.observe(self.on.config_changed, self._on_config_changed) + self.framework.observe(self.on.update_status, self._on_update_status) + self.framework.observe(self.on.log_error_action, self._on_log_error_action) + self.framework.observe( + self._loki_consumer.on.loki_push_api_endpoint_joined, + self._on_loki_push_api_endpoint_joined, + ) + self.framework.observe( + self._loki_consumer.on.loki_push_api_endpoint_departed, + self._on_loki_push_api_endpoint_departed, + ) + + self.topology = JujuTopology.from_charm(self) + self.unit.status = ActiveStatus() + + def _setup_logging(self, handlers_init: dict = None) -> None: + """Ensure logging is configured correctly. + + A dict of "wanted" loggers is passed in, and the list of current loggers known to + `logging.getLogger()` is compared. If the "wanted" loggers are not part of that list, + or that list has loggers which are not "wanted", it is reconciled via `addHandler()` and + `removeHandler()`. Python's `logging` objects are actually global variables, and this + add/remove is a necessary step to ensure the state is "correct" rather than blindly + adding/removing them. + + Args: + handlers_init: a dict of 'name' -> handler objects + """ + handlers_init = handlers_init or {} + logger = logging.getLogger("loki-tester") + + # Make sure we always have a local console logger + console_handler = {"console": {"handler": logging.StreamHandler(), "level": logging.INFO}} + handlers_init.update(console_handler) + + for k, v in handlers_init.items(): + # Give each handler a "name" property which matches so we can find it + v["handler"].name = k + v["handler"].setLevel(v["level"]) + + handlers = {v["handler"].name: v["handler"] for v in handlers_init.values()} + + # Check against logger.Manager and exclude "useless" values like logging.Placeholder + existing_handlers: dict[str, logging.Handler] = {k: v for k, v in logger.manager.loggerDict.items() if not isinstance(v, logging.PlaceHolder) and "loki" not in k} # type: ignore + + if set(handlers.keys()) == set(existing_handlers.keys()): + # Nothing to do + return + + # If we're here, we need to add or remove some loggers + to_remove = [v for k, v in existing_handlers.items() if k not in handlers] + to_add = [v for k, v in handlers.items() if k not in existing_handlers] + + # Remove loggers we don't want anymore + for h in to_remove: + logger.removeHandler(h) + + # Add any missing loggers whichshould be there + for h in to_add: + logger.addHandler(h) + + self.logger = logger + self.logger.debug( + "Configured logging with {} handlers: {}".format( + len(handlers.keys()), ", ".join(handlers.keys()) + ) + ) + + def _on_config_changed(self, _): + """Handle changed configuration.""" + self.set_logger() + self.log("debug", "Handling configuration change") + + def _on_update_status(self, _): + """Handle status updates.""" + self.set_logger() + self.log("debug", "Updating status") + + def _on_loki_push_api_endpoint_joined(self, _): + self.set_logger() 
+ + self.log("debug", "Loki push API endpoint joined") + + def _on_loki_push_api_endpoint_departed(self, _): + self.set_logger() + self.log("debug", "Loki push API endpoint departed") + + def _on_log_error_action(self, event): + self.set_logger() + message = event.params["message"] + logged = self.log("error", message) + if logged: + self.logger.warning("Error message logged!") + event.set_results({"message": "Error message successfully logged"}) + else: + self.logger.warning("Error message not logged!") + event.fail("Failed to log error message") + + def set_logger(self, local_only=False): + """Set up this charm's logger. + + Configure the logger attribute for this charm. The `local_only` param + can be used on RelationBroken or RelationDeparted, where leaving a "remote" + Loki logger in place may lead to attempts to send requests to a dying endpoint; + `local_only` ensures that logging goes only to the console of this charm, + to isolate behavior. + + If `local_only` is not set, try to fetch a list of Loki endpoints and set + the logger for this charm to match. If the endpoint list is empty, then + it will be console only. + + Args: + local_only: a boolean to enable only local console logging + """ + if local_only: + self._setup_logging({}) + return + + tags = self.topology.label_matcher_dict + log_endpoints = self._loki_consumer.loki_endpoints + + loki_handlers = {} + for idx, endpoint in enumerate(log_endpoints): + logging_loki.emitter.LokiEmitter.level_tag = "level" + loki_handlers.update( + { + "loki-{}".format(idx): { + "handler": logging_loki.LokiQueueHandler( + Queue(-1), url=endpoint["url"], version="1", tags=dict(tags) + ), + "level": logging.DEBUG, + } + } + ) + + self._setup_logging(loki_handlers) + + if loki_handlers: + self.log("debug", "Successfully set Loki Logger") + + def log(self, level, msg): + try: + getattr(self.logger, level)(msg) + return True + except AttributeError: + return False + + +if __name__ == "__main__": + main(LokiTesterCharm) diff --git a/tests/integration/loki-tester/src/loki_alert_rules/alert_on_error.rule b/tests/integration/loki-tester/src/loki_alert_rules/alert_on_error.rule new file mode 100644 index 00000000..98a86e9d --- /dev/null +++ b/tests/integration/loki-tester/src/loki_alert_rules/alert_on_error.rule @@ -0,0 +1,7 @@ +alert: AlertOnError +expr: | + count_over_time(({logger="loki-tester", level="error", %%juju_topology%%})[10m]) > 0 +labels: + severity: high +annotations: + summary: An error message has been logged diff --git a/tests/integration/loki-tester/src/loki_alert_rules/free-standing/alerting.rule b/tests/integration/loki-tester/src/loki_alert_rules/free-standing/alerting.rule new file mode 100644 index 00000000..a6720dda --- /dev/null +++ b/tests/integration/loki-tester/src/loki_alert_rules/free-standing/alerting.rule @@ -0,0 +1,23 @@ +groups: + - name: should_fire + rules: + - alert: HighPercentageError + expr: | + sum(rate({app="foo", env="production"} |= "error" [5m])) by (job) + / + sum(rate({app="foo", env="production"}[5m])) by (job) + > 0.05 + for: 10m + labels: + severity: page + annotations: + summary: High request latency + - name: credentials_leak + rules: + - alert: http-credentials-leaked + annotations: + message: "{{ $labels.job }} is leaking http basic auth credentials." 
+ expr: 'sum by (cluster, job, pod) (count_over_time({namespace="prod"} |~ "http(s?)://(\\w+):(\\w+)@" [5m]) > 0)' + for: 10m + labels: + severity: critical diff --git a/tests/integration/prometheus-tester/.jujuignore b/tests/integration/prometheus-tester/.jujuignore new file mode 100644 index 00000000..de6e6ae5 --- /dev/null +++ b/tests/integration/prometheus-tester/.jujuignore @@ -0,0 +1,4 @@ +/venv +*.py[cod] +*.charm +tester diff --git a/tests/integration/prometheus-tester/charmcraft.yaml b/tests/integration/prometheus-tester/charmcraft.yaml new file mode 100644 index 00000000..f35c5da5 --- /dev/null +++ b/tests/integration/prometheus-tester/charmcraft.yaml @@ -0,0 +1,12 @@ +type: charm +bases: + - build-on: + - name: "ubuntu" + channel: "20.04" + run-on: + - name: "ubuntu" + channel: "20.04" +parts: + charm: + build-packages: + - git diff --git a/tests/integration/prometheus-tester/config.yaml b/tests/integration/prometheus-tester/config.yaml new file mode 100644 index 00000000..62b606c2 --- /dev/null +++ b/tests/integration/prometheus-tester/config.yaml @@ -0,0 +1,12 @@ +# Copyright 2021 Canonical Ltd +# See LICENSE file for licensing details. + +options: + scrape-interval: + default: 1s + description: "Prometheus job configuration for scrape interval" + type: string + alert-rules-path: + default: src/prometheus_alert_rules + description: "Path for alert rules passed to the Provider" + type: string diff --git a/tests/integration/prometheus-tester/metadata.yaml b/tests/integration/prometheus-tester/metadata.yaml new file mode 100644 index 00000000..2eb536aa --- /dev/null +++ b/tests/integration/prometheus-tester/metadata.yaml @@ -0,0 +1,21 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +name: prometheus-tester +description: | + This charm generates test data which is ingested by the + Prometheus operator +summary: | + A charm to test the Prometheus operator +platforms: + - kubernetes +containers: + prometheus-tester: + resource: prometheus-tester-image +resources: + prometheus-tester-image: + type: oci-image + description: upstream docker image for prometheus-tester + upstream-source: python:slim +provides: + metrics-endpoint: + interface: prometheus_scrape diff --git a/tests/integration/prometheus-tester/requirements.txt b/tests/integration/prometheus-tester/requirements.txt new file mode 100644 index 00000000..5abb4f08 --- /dev/null +++ b/tests/integration/prometheus-tester/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/canonical/operator#egg=ops diff --git a/tests/integration/prometheus-tester/src/charm.py b/tests/integration/prometheus-tester/src/charm.py new file mode 100755 index 00000000..4cc559df --- /dev/null +++ b/tests/integration/prometheus-tester/src/charm.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""A Charm to functionally test the Prometheus Operator.""" + +import logging +from pathlib import Path + +from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider +from ops.charm import CharmBase +from ops.main import main +from ops.model import ActiveStatus, BlockedStatus +from ops.pebble import ChangeError, ExecError, Layer + +logger = logging.getLogger(__name__) + + +class PrometheusTesterCharm(CharmBase): + """A Charm used to test the Prometheus charm.""" + + def __init__(self, *args): + super().__init__(*args) + self._name = "prometheus-tester" + self._pip_path = "/usr/local/bin/pip" + self._metrics_exporter_script = Path("src/metrics.py") + jobs = [ + { + "scrape_interval": self.model.config["scrape-interval"], + "static_configs": [{"targets": ["*:8000"], "labels": {"name": self._name}}], + } + ] + self.prometheus = MetricsEndpointProvider( + self, jobs=jobs, alert_rules_path=self.model.config["alert-rules-path"] + ) + self.framework.observe( + self.on.prometheus_tester_pebble_ready, self._on_prometheus_tester_pebble_ready + ) + self.framework.observe(self.on.config_changed, self._on_config_changed) + + def _on_prometheus_tester_pebble_ready(self, event): + """Install the metrics exporter script and its dependencies.""" + container = event.workload + + self._install_prometheus_client() + metrics_endpoint_script = self._metrics_exporter() + container.push("/metrics.py", metrics_endpoint_script) + logger.info("Pushed metrics exporter") + + layer = self._tester_pebble_layer() + container.add_layer(self._name, layer, combine=True) + container.restart(self._name) + + self.unit.status = ActiveStatus() + + def _on_config_changed(self, event): + """Reconfigure the Prometheus tester.""" + container = self.unit.get_container(self._name) + if not container.can_connect(): + self.unit.status = BlockedStatus("Waiting for Pebble ready") + return + + self._install_prometheus_client() + metrics_endpoint_script = self._metrics_exporter() + container.push("/metrics.py", metrics_endpoint_script) + logger.info("Pushed metrics exporter") + + current_services = container.get_plan().services + new_layer = self._tester_pebble_layer() + if current_services != new_layer.services: + container.add_layer(self._name, new_layer, combine=True) + logger.debug("Added tester layer to container") + + container.restart(self._name) + logger.info("Restarted tester service") + + self.unit.status = ActiveStatus() + + def _tester_pebble_layer(self): + """Generate Prometheus tester pebble layer.""" + layer_spec = { + "summary": "prometheus tester", + "description": "a test data generator for Prometheus", + "services": { + self._name: { + "override": "replace", + "summary": "metrics exporter service", + "command": "python /metrics.py", + "startup": "enabled", + } + }, + } + return Layer(layer_spec) + + def _install_prometheus_client(self): + """Install Prometheus tester dependencies.""" + container = self.unit.get_container(self._name) + if not container.can_connect(): + self.unit.status = BlockedStatus("Waiting for Pebble ready") + return + + process = container.exec([self._pip_path, "install", "prometheus_client"]) + try: + _, stderr = process.wait_output() + logger.debug("Installed prometheus client") + if stderr: + logger.debug(stderr) + return + + except ExecError as e: + logger.error( + "Failed to install prometheus client: exited with code %d. 
Stderr:", e.exit_code + ) + for line in e.stderr.splitlines(): + logger.error(" %s", line) + self.unit.status = BlockedStatus("Failed to install prometheus client (see debug-log)") + + except ChangeError as e: + logger.error("Failed to install prometheus client: %s", str(e)) + self.unit.status = BlockedStatus("Failed to install prometheus client (see debug-log)") + + def _metrics_exporter(self): + """Generate the metrics exporter script.""" + with self._metrics_exporter_script.open() as script: + return script.read() + + +if __name__ == "__main__": + main(PrometheusTesterCharm) diff --git a/tests/integration/prometheus-tester/src/metrics.py b/tests/integration/prometheus-tester/src/metrics.py new file mode 100644 index 00000000..aa45f874 --- /dev/null +++ b/tests/integration/prometheus-tester/src/metrics.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import random +import time + +from prometheus_client import Summary, start_http_server + +# Metric that tracks time spent and number of requests made. +REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request") + + +@REQUEST_TIME.time() +def process_request(t): + """A fake function that takes a configurable amount of time to run. + + Args: + t: integer specifying amount of time that should be + spent in processing this request + """ + time.sleep(t) + + +def main(port=8000): + """Expose a metrics endpoint to prometheus.""" + start_http_server(port) + while True: + process_request(random.random()) + + +if __name__ == "__main__": + main() diff --git a/tests/integration/prometheus-tester/src/prometheus_alert_rules/cpu_overuse.rule b/tests/integration/prometheus-tester/src/prometheus_alert_rules/cpu_overuse.rule new file mode 100644 index 00000000..ae8575b3 --- /dev/null +++ b/tests/integration/prometheus-tester/src/prometheus_alert_rules/cpu_overuse.rule @@ -0,0 +1,8 @@ +alert: CPUOverUse +expr: process_cpu_seconds_total{%%juju_topology%%} > 0.12 +for: 0m +labels: + severity: Low +annotations: + summary: "Instance {{ $labels.instance }} CPU over use" + description: "{{ $labels.instance }} of job {{ $labels.job }} has used too much CPU." diff --git a/tests/integration/prometheus-tester/src/with_extra_alert_rule/cpu_overuse.rule b/tests/integration/prometheus-tester/src/with_extra_alert_rule/cpu_overuse.rule new file mode 100644 index 00000000..ae8575b3 --- /dev/null +++ b/tests/integration/prometheus-tester/src/with_extra_alert_rule/cpu_overuse.rule @@ -0,0 +1,8 @@ +alert: CPUOverUse +expr: process_cpu_seconds_total{%%juju_topology%%} > 0.12 +for: 0m +labels: + severity: Low +annotations: + summary: "Instance {{ $labels.instance }} CPU over use" + description: "{{ $labels.instance }} of job {{ $labels.job }} has used too much CPU." diff --git a/tests/integration/prometheus-tester/src/with_extra_alert_rule/target_missing.rule b/tests/integration/prometheus-tester/src/with_extra_alert_rule/target_missing.rule new file mode 100644 index 00000000..2bbd34dc --- /dev/null +++ b/tests/integration/prometheus-tester/src/with_extra_alert_rule/target_missing.rule @@ -0,0 +1,8 @@ +alert: PrometheusTargetMissing +expr: up == 0 +for: 0m +labels: + severity: critical +annotations: + summary: "Prometheus target missing (instance {{ $labels.instance }})" + description: "A Prometheus target has disappeared. An exporter might be crashed." 
diff --git a/tests/integration/prometheus.py b/tests/integration/prometheus.py index 49ff9dcd..cbb1e968 100644 --- a/tests/integration/prometheus.py +++ b/tests/integration/prometheus.py @@ -6,6 +6,7 @@ from typing import List, Literal import aiohttp +from prometheus_api_client import PrometheusConnect logger = logging.getLogger(__name__) @@ -14,7 +15,7 @@ class Prometheus: """A class that represents a running instance of Prometheus.""" def __init__(self, host="localhost", port=9090): - """Manage a Prometheus application. + """Utility to manage a Prometheus application. Args: host: Optional; host address of Prometheus application. @@ -26,6 +27,52 @@ def __init__(self, host="localhost", port=9090): # The default (5 min) prolongs itests unnecessarily. self.timeout = aiohttp.ClientTimeout(total=5) + async def is_ready(self) -> bool: + """Send a GET request to check readiness. + + Returns: + True if Prometheus is ready (returned 200 OK); False otherwise. + """ + url = f"{self.base_url}/-/ready" + + async with aiohttp.ClientSession(timeout=self.timeout) as session: + async with session.get(url) as response: + return response.status == 200 + + async def config(self) -> str: + """Send a GET request to get Prometheus configuration. + + Returns: + YAML config in string format or empty string + """ + url = f"{self.base_url}/api/v1/status/config" + # Response looks like this: + # { + # "status": "success", + # "data": { + # "yaml": "global:\n + # scrape_interval: 1m\n + # scrape_timeout: 10s\n + # evaluation_interval: 1m\n + # rule_files:\n + # - /etc/prometheus/rules/juju_*.rules\n + # scrape_configs:\n + # - job_name: prometheus\n + # honor_timestamps: true\n + # scrape_interval: 5s\n + # scrape_timeout: 5s\n + # metrics_path: /metrics\n + # scheme: http\n + # static_configs:\n + # - targets:\n + # - localhost:9090\n" + # } + # } + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + result = await response.json() + return result["data"]["yaml"] if result["status"] == "success" else "" + async def rules(self, rules_type: Literal["alert", "record"] = None) -> list: """Send a GET request to get Prometheus rules. @@ -43,6 +90,73 @@ async def rules(self, rules_type: Literal["alert", "record"] = None) -> list: # {"status":"success","data":{"groups":[]} return result["data"]["groups"] if result["status"] == "success" else [] + async def labels(self) -> List[str]: + """Send a GET request to get labels. + + Returns: + List of labels + """ + url = f"{self.base_url}/api/v1/labels" + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + result = await response.json() + # response looks like this: + # { + # "status": "success", + # "data": [ + # "__name__", + # "alertname", + # "alertstate", + # ... + # "juju_application", + # "juju_charm", + # "juju_model", + # "juju_model_uuid", + # ... + # "version" + # ] + # } + return result["data"] if result["status"] == "success" else [] + + async def alerts(self) -> List[dict]: + """Send a GET request to get alerts. 
+ + Returns: + List of alerts + """ + url = f"{self.base_url}/api/v1/alerts" + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + result = await response.json() + # response looks like this: + # + # { + # "status": "success", + # "data": { + # "alerts": [ + # { + # "labels": { + # "alertname": "AlwaysFiring", + # "job": "non_existing_job", + # "juju_application": "avalanche-k8s", + # "juju_charm": "avalanche-k8s", + # "juju_model": "remotewrite", + # "juju_model_uuid": "5d2582f6-f8c9-4496-835b-675431d1fafe", + # "severity": "High" + # }, + # "annotations": { + # "description": " of job non_existing_job is firing the dummy alarm.", + # "summary": "Instance dummy alarm (always firing)" + # }, + # "state": "firing", + # "activeAt": "2022-01-13T18:53:12.808550042Z", + # "value": "1e+00" + # } + # ] + # } + # } + return result["data"]["alerts"] if result["status"] == "success" else [] + async def active_targets(self) -> List[dict]: """Send a GET request to get active scrape targets. @@ -83,3 +197,35 @@ async def active_targets(self) -> List[dict]: # } # } return result["data"]["activeTargets"] if result["status"] == "success" else [] + + async def tsdb_head_stats(self) -> dict: + """Send a GET request to get the TSDB headStats. + + Returns: + The headStats dict. + """ + url = f"{self.base_url}/api/v1/status/tsdb" + async with aiohttp.ClientSession(timeout=self.timeout) as session: + async with session.get(url) as response: + result = await response.json() + # response looks like this: + # + # { + # "status": "success", + # "data": { + # "headStats": { + # "numSeries": 610, + # "numLabelPairs": 367, + # "chunkCount": 5702, + # "minTime": 1652720232481, + # "maxTime": 1652724527481 + # }, + # "seriesCountByMetricName": [ ... ] + # ... + # } + # } + return result["data"]["headStats"] if result["status"] == "success" else {} + + async def run_promql(self, query: str, disable_ssl: bool = True) -> list: + prometheus = PrometheusConnect(url=self.base_url, disable_ssl=disable_ssl) + return prometheus.custom_query(query=query) diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index d9cc4343..1b5faa9f 100755 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -19,35 +19,28 @@ @pytest.mark.abort_on_fail -async def test_build_and_deploy(ops_test): +async def test_build_and_deploy(ops_test, grafana_agent_charm): """Build the charm-under-test and deploy it together with related charms. Assert on the unit status before any relations/configurations take place. 
""" # build and deploy charm from local source folder - charm_under_test = await ops_test.build_charm(".") resources = {"agent-image": METADATA["resources"]["agent-image"]["upstream-source"]} - await ops_test.model.deploy(charm_under_test, resources=resources, application_name="agent") + await ops_test.model.deploy(grafana_agent_charm, resources=resources, application_name="agent") - # due to a juju bug, occasionally some charms finish a startup sequence with "waiting for IP - # address" - # issuing dummy update_status just to trigger an event - await ops_test.model.set_config({"update-status-hook-interval": "10s"}) - - await ops_test.model.wait_for_idle(apps=["agent"], status="active", timeout=1000) + await ops_test.model.wait_for_idle( + apps=["agent"], status="active", timeout=300, idle_period=30 + ) assert ops_test.model.applications["agent"].units[0].workload_status == "active" - # effectively disable the update status from firing - await ops_test.model.set_config({"update-status-hook-interval": "60m"}) - -async def test_relating_to_loki(ops_test): +async def test_relates_to_loki(ops_test): await ops_test.model.deploy("loki-k8s", channel="edge", application_name="loki") await ops_test.model.add_relation("loki", "agent:logging-consumer") await ops_test.model.wait_for_idle(apps=["loki", "agent"], status="active", timeout=1000) -async def test_relating_to_grafana(ops_test): +async def test_has_own_dashboard(ops_test): await ops_test.model.deploy("grafana-k8s", channel="edge", application_name="grafana") await ops_test.model.add_relation("grafana", "agent:grafana-dashboard") await ops_test.model.wait_for_idle(apps=["agent", "grafana"], status="active", timeout=1000) @@ -55,12 +48,13 @@ async def test_relating_to_grafana(ops_test): assert any(dashboard["title"] == "Grafana Agent" for dashboard in dashboards) -async def test_relating_to_prometheus(ops_test): +async def test_has_own_alert_rules(ops_test): await ops_test.model.deploy( "prometheus-k8s", channel="edge", application_name="prometheus", trust=True ) await ops_test.model.wait_for_idle(apps=["prometheus"], status="active", timeout=1000) alert_rules = await get_prometheus_rules(ops_test, "prometheus", 0) + # Check we do not have alert rules in Prometheus assert len(alert_rules) == 0 diff --git a/tests/integration/test_forwards_alerts.py b/tests/integration/test_forwards_alerts.py new file mode 100755 index 00000000..92266731 --- /dev/null +++ b/tests/integration/test_forwards_alerts.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import asyncio +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import loki_rules, oci_image, prometheus_rules + +logger = logging.getLogger(__name__) +METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) + +agent_name = "agent" +loki_name = "loki" +loki_tester_name = "loki-tester" +prometheus_name = "prometheus" +prometheus_tester_name = "prometheus-tester" + + +@pytest.mark.abort_on_fail +async def test_deploy(ops_test, grafana_agent_charm): + """Build the charm-under-test and deploy it together with related charms. + + Assert on the unit status before any relations/configurations take place. 
+ """ + resources = {"agent-image": METADATA["resources"]["agent-image"]["upstream-source"]} + await ops_test.model.deploy( + grafana_agent_charm, resources=resources, application_name=agent_name + ) + + # due to a juju bug, occasionally some charms finish a startup sequence with "waiting for IP + # address" + # issuing dummy update_status just to trigger an event + await ops_test.model.set_config({"update-status-hook-interval": "10s"}) + + await ops_test.model.wait_for_idle(apps=[agent_name], status="active", timeout=300) + assert ops_test.model.applications[agent_name].units[0].workload_status == "active" + + +async def test_relate_to_external_apps(ops_test): + await asyncio.gather( + ops_test.model.deploy("loki-k8s", channel="edge", application_name=loki_name), + ops_test.model.deploy( + "prometheus-k8s", channel="edge", application_name=prometheus_name, trust=True + ), + ) + await asyncio.gather( + ops_test.model.add_relation(f"{loki_name}:logging", agent_name), + ops_test.model.add_relation(f"{prometheus_name}:receive-remote-write", agent_name), + ) + await ops_test.model.wait_for_idle( + apps=[loki_name, prometheus_name, agent_name], status="active", timeout=300 + ) + + +async def test_relate_to_loki_tester_and_check_alerts(ops_test, loki_tester_charm): + await ops_test.model.deploy(loki_tester_charm, application_name=loki_tester_name) + await ops_test.model.add_relation(agent_name, loki_tester_name) + await ops_test.model.wait_for_idle( + apps=[loki_tester_name, agent_name], status="active", timeout=300 + ) + + loki_alerts = await loki_rules(ops_test, loki_name) + assert len(loki_alerts) == 1 + + +async def test_relate_to_prometheus_tester_and_check_alerts(ops_test, prometheus_tester_charm): + await ops_test.model.deploy( + prometheus_tester_charm, + resources={ + "prometheus-tester-image": oci_image( + "./tests/integration/prometheus-tester/metadata.yaml", "prometheus-tester-image" + ) + }, + application_name=prometheus_tester_name, + ) + await ops_test.model.add_relation(agent_name, prometheus_tester_name) + await ops_test.model.wait_for_idle( + apps=[prometheus_tester_name, agent_name], status="active", timeout=300 + ) + + prometheus_alerts = await prometheus_rules(ops_test, prometheus_name, 0) + assert len(prometheus_alerts) == 4 diff --git a/tests/integration/test_kubectl_delete.py b/tests/integration/test_kubectl_delete.py index 28816765..072361e5 100644 --- a/tests/integration/test_kubectl_delete.py +++ b/tests/integration/test_kubectl_delete.py @@ -16,12 +16,14 @@ @pytest.mark.abort_on_fail -async def test_deploy_from_local_path(ops_test, charm_under_test): +async def test_deploy_from_local_path(ops_test, grafana_agent_charm): """Deploy the charm-under-test.""" logger.debug("deploy local charm") resources = {"agent-image": METADATA["resources"]["agent-image"]["upstream-source"]} - await ops_test.model.deploy(charm_under_test, application_name=app_name, resources=resources) + await ops_test.model.deploy( + grafana_agent_charm, application_name=app_name, resources=resources + ) await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) diff --git a/tests/integration/test_upgrade_charm.py b/tests/integration/test_upgrade_charm.py index 5089b545..df94700b 100644 --- a/tests/integration/test_upgrade_charm.py +++ b/tests/integration/test_upgrade_charm.py @@ -16,7 +16,7 @@ @pytest.mark.abort_on_fail -async def test_deploy_from_edge_and_upgrade_from_local_path(ops_test, charm_under_test): +async def test_deploy_from_edge_and_upgrade_from_local_path(ops_test, 
diff --git a/tests/integration/test_upgrade_charm.py b/tests/integration/test_upgrade_charm.py index 5089b545..df94700b 100644 --- a/tests/integration/test_upgrade_charm.py +++ b/tests/integration/test_upgrade_charm.py @@ -16,7 +16,7 @@ @pytest.mark.abort_on_fail -async def test_deploy_from_edge_and_upgrade_from_local_path(ops_test, charm_under_test): +async def test_deploy_from_edge_and_upgrade_from_local_path(ops_test, grafana_agent_charm): """Deploy from charmhub and then upgrade with the charm-under-test.""" logger.info("deploy charm from charmhub") resources = {"agent-image": METADATA["resources"]["agent-image"]["upstream-source"]} @@ -24,6 +24,8 @@ async def test_deploy_from_edge_and_upgrade_from_local_path(ops_test, charm_unde await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000) - logger.info("upgrade deployed charm with local charm %s", charm_under_test) - await ops_test.model.applications[app_name].refresh(path=charm_under_test, resources=resources) + logger.info("upgrade deployed charm with local charm %s", grafana_agent_charm) + await ops_test.model.applications[app_name].refresh( + path=grafana_agent_charm, resources=resources + ) await ops_test.model.wait_for_idle(apps=[app_name], status="active", timeout=1000)
diff --git a/tests/unit/test_alerts.py b/tests/unit/test_alerts.py new file mode 100644 index 00000000..e87cecaa --- /dev/null +++ b/tests/unit/test_alerts.py @@ -0,0 +1,248 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import pathlib +import tempfile +import unittest +from unittest.mock import patch + +import yaml +from ops.model import Container +from ops.testing import Harness + +from charm import GrafanaAgentOperatorCharm + +PROMETHEUS_ALERT_RULES = { + "groups": [ + { + "name": "lma_f2c1b2a6-e006-11eb-ba80-0242ac130004_provider-tester_alerts", + "rules": [ + { + "alert": "CPUOverUse", + "expr": 'process_cpu_seconds_total{juju_application="provider-tester",' + 'juju_model="lma",' + 'juju_model_uuid="f2c1b2a6-e006-11eb-ba80-0242ac130004"} > 0.12', + "for": "0m", + "labels": { + "severity": "Low", + "juju_model": "lma", + "juju_model_uuid": "f2c1b2a6-e006-11eb-ba80-0242ac130004", + "juju_application": "provider-tester", + }, + "annotations": { + "summary": "Instance {{ $labels.instance }} CPU over use", + "description": "{{ $labels.instance }} of job " + "{{ $labels.job }} has used too much CPU.", + }, + }, + { + "alert": "PrometheusTargetMissing", + "expr": 'up{juju_application="provider-tester",juju_model="lma",' + 'juju_model_uuid="f2c1b2a6-e006-11eb-ba80-0242ac130004"} == 0', + "for": "0m", + "labels": { + "severity": "critical", + "juju_model": "lma", + "juju_model_uuid": "f2c1b2a6-e006-11eb-ba80-0242ac130004", + "juju_application": "provider-tester", + }, + "annotations": { + "summary": "Prometheus target missing (instance {{ $labels.instance }})", + "description": "A Prometheus target has disappeared."
+ "An exporter might be crashed.\n" + "VALUE = {{ $value }}\n LABELS = {{ $labels }}", + }, + }, + ], + } + ] +} + +LOKI_ALERT_RULES = { + "groups": [ + { + "name": "lma_f2c1b2a6-e006-11eb-ba80-0242ac130004_provider-tester_alerts", + "rules": [ + { + "alert": "TooManyLogMessages", + "expr": 'count_over_time({job=".+",' + 'juju_application="provider-tester",' + 'juju_model="lma",' + 'juju_model_uuid="f2c1b2a6-e006-11eb-ba80-0242ac130004"}[1m]) > 10', + "for": "0m", + "labels": { + "severity": "Low", + "juju_model": "lma", + "juju_model_uuid": "f2c1b2a6-e006-11eb-ba80-0242ac130004", + "juju_application": "provider-tester", + }, + "annotations": { + "summary": "Instance {{ $labels.instance }} CPU over use", + "description": "{{ $labels.instance }} of job " + "{{ $labels.job }} has used too much CPU.", + }, + } + ], + } + ] +} + + +@patch.object(Container, "restart", new=lambda x, y: True) +@patch("charms.observability_libs.v0.juju_topology.JujuTopology.is_valid_uuid", lambda *args: True) +class TestAlertIngestion(unittest.TestCase): + @patch("charm.KubernetesServicePatch", lambda x, y: None) + @patch("charm.METRICS_RULES_SRC_PATH", tempfile.mkdtemp()) + @patch("charm.METRICS_RULES_DEST_PATH", tempfile.mkdtemp()) + @patch("charm.LOKI_RULES_SRC_PATH", tempfile.mkdtemp()) + @patch("charm.LOKI_RULES_DEST_PATH", tempfile.mkdtemp()) + @patch( + "charms.observability_libs.v0.juju_topology.JujuTopology.is_valid_uuid", lambda *args: True + ) + def setUp(self): + self.harness = Harness(GrafanaAgentOperatorCharm) + self.addCleanup(self.harness.cleanup) + self.harness.set_model_info(name="lma", uuid="1234567890") + self.harness.set_leader(True) + self.harness.begin_with_initial_hooks() + self.metrics_path = self.harness.charm.metrics_rules_paths + self.loki_path = self.harness.charm.loki_rules_paths + self.harness.container_pebble_ready("agent") + + +class TestPrometheusRules(TestAlertIngestion): + def test_consumes_prometheus_rules(self): + rel_id = self.harness.add_relation("metrics-endpoint", "provider") + self.harness.add_relation_unit(rel_id, "provider/0") + self.harness.update_relation_data( + rel_id, "provider", {"alert_rules": json.dumps(PROMETHEUS_ALERT_RULES)} + ) + + rule_files = [f for f in pathlib.Path(self.metrics_path.dest).iterdir() if f.is_file()] + + rules = yaml.safe_load(rule_files[0].read_text()) + for group in rules["groups"]: + if group["name"].endswith("provider-tester_alerts"): + expr = group["rules"][0]["expr"] + self.assertIn("juju_model", expr) + self.assertIn("juju_model_uuid", expr) + self.assertIn("juju_application", expr) + self.assertNotIn("juju_unit", expr) + self.assertEqual( + set(group["rules"][0]["labels"]), + { + "juju_application", + "juju_model", + "juju_model_uuid", + "severity", + }, + ) + break + else: + assert False # Could not find the correct alert rule to check + + def test_forwards_prometheus_rules(self): + rel_id = self.harness.add_relation("metrics-endpoint", "provider") + self.harness.add_relation_unit(rel_id, "provider/0") + + prom_id = self.harness.add_relation("send-remote-write", "prom") + self.harness.add_relation_unit(prom_id, "prom/0") + + self.harness.update_relation_data( + rel_id, "provider", {"alert_rules": json.dumps(PROMETHEUS_ALERT_RULES)} + ) + + data = self.harness.get_relation_data(prom_id, self.harness.model.app.name) + rules = json.loads(data["alert_rules"]) + + for group in rules["groups"]: + if group["name"].endswith("provider-tester_alerts"): + expr = group["rules"][0]["expr"] + self.assertIn("juju_model", expr) + 
self.assertIn("juju_model_uuid", expr) + self.assertIn("juju_application", expr) + self.assertNotIn("juju_unit", expr) + self.assertEqual( + set(group["rules"][0]["labels"]), + { + "juju_application", + "juju_model", + "juju_charm", + "juju_model_uuid", + "severity", + }, + ) + break + else: + assert False # Could not find the correct alert rule to check + + +class TestLokiRules(TestAlertIngestion): + def test_consumes_loki_rules(self): + rel_id = self.harness.add_relation("logging-provider", "consumer") + self.harness.add_relation_unit(rel_id, "consumer/0") + self.harness.update_relation_data( + rel_id, "consumer", {"alert_rules": json.dumps(LOKI_ALERT_RULES)} + ) + + rule_files = [f for f in pathlib.Path(self.loki_path.dest).iterdir() if f.is_file()] + + rules = yaml.safe_load(rule_files[0].read_text()) + for group in rules["groups"]: + if group["name"].endswith("provider-tester_alerts"): + expr = group["rules"][0]["expr"] + self.assertIn("juju_model", expr) + self.assertIn("juju_model_uuid", expr) + self.assertIn("juju_application", expr) + self.assertNotIn("juju_unit", expr) + self.assertEqual( + set(group["rules"][0]["labels"]), + { + "juju_application", + "juju_model", + "juju_model_uuid", + "severity", + }, + ) + break + else: + assert False # Could not find the correct alert rule to check + + def test_forwards_loki_rules(self): + rel_id = self.harness.add_relation("logging-provider", "consumer") + self.harness.add_relation_unit(rel_id, "consumer/0") + + loki_id = self.harness.add_relation("logging-consumer", "loki") + self.harness.add_relation_unit(loki_id, "loki/0") + + self.harness.update_relation_data( + rel_id, "consumer", {"alert_rules": json.dumps(LOKI_ALERT_RULES)} + ) + + data = self.harness.get_relation_data(loki_id, self.harness.model.app.name) + rules = json.loads(data["alert_rules"]) + + for group in rules["groups"]: + if group["name"].endswith("provider-tester_alerts_alerts"): + expr = group["rules"][0]["expr"] + self.assertIn("juju_model", expr) + self.assertIn("juju_model_uuid", expr) + self.assertIn("juju_application", expr) + self.assertNotIn("juju_unit", expr) + self.assertEqual( + set(group["rules"][0]["labels"]), + { + "juju_application", + "juju_model", + "juju_model_uuid", + "juju_charm", + "severity", + }, + ) + break + else: + assert False # Could not find the correct alert rule to check diff --git a/tests/unit/test_charm.py b/tests/unit/test_scrape_configuration.py similarity index 94% rename from tests/unit/test_charm.py rename to tests/unit/test_scrape_configuration.py index 72bf6079..6171e3fb 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_scrape_configuration.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. 
import json +import tempfile import unittest from typing import Any, Dict from unittest.mock import patch @@ -72,8 +73,16 @@ @patch.object(Container, "restart", new=lambda x, y: True) -class TestCharm(unittest.TestCase): +@patch("charms.observability_libs.v0.juju_topology.JujuTopology.is_valid_uuid", lambda *args: True) +class TestScrapeConfiguration(unittest.TestCase): @patch("charm.KubernetesServicePatch", lambda x, y: None) + @patch("charm.METRICS_RULES_SRC_PATH", tempfile.mkdtemp()) + @patch("charm.METRICS_RULES_DEST_PATH", tempfile.mkdtemp()) + @patch("charm.LOKI_RULES_SRC_PATH", tempfile.mkdtemp()) + @patch("charm.LOKI_RULES_DEST_PATH", tempfile.mkdtemp()) + @patch( + "charms.observability_libs.v0.juju_topology.JujuTopology.is_valid_uuid", lambda *args: True + ) def setUp(self): self.harness = Harness(GrafanaAgentOperatorCharm) self.addCleanup(self.harness.cleanup) diff --git a/tox.ini b/tox.ini index d079e48f..02b76dd5 100644 --- a/tox.ini +++ b/tox.ini @@ -54,11 +54,8 @@ commands = isort --check-only --diff {[vars]all_path} black --check --diff {[vars]all_path} -[testenv:static-{charm,lib,unit,integration}] +[testenv:static-{charm,lib}] description = Run static analysis checks -setenv = - unit: MYPYPATH = {[vars]tst_path}/unit - integration: MYPYPATH = {[vars]tst_path}/integration deps = mypy types-dataclasses @@ -67,12 +64,8 @@ deps = types-urllib3 charm: -r{toxinidir}/requirements.txt lib: git+https://github.com/canonical/operator#egg=ops - unit: {[testenv:unit]deps} - integration: {[testenv:integration]deps} commands = charm: mypy {[vars]src_path} {posargs} - unit: mypy {[vars]tst_path}/unit {posargs} - integration: mypy {[vars]tst_path}/integration {posargs} [testenv:unit] description = Run unit tests @@ -96,6 +89,7 @@ deps = asyncstdlib juju pytest + prometheus-api-client # There is a bug in 0.17.0 which causes test failures pytest-operator==0.15.0 commands =
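
To make concrete what the unit tests above assert, here is an editor's illustration of the shape an alert rule takes once Juju topology labels are attached. This is illustrative only, not the charm's actual implementation (which relies on the observability libraries):

    # Illustrative only: shows the label set asserted by tests/unit/test_alerts.py.
    topology = {
        "juju_model": "lma",
        "juju_model_uuid": "f2c1b2a6-e006-11eb-ba80-0242ac130004",
        "juju_application": "provider-tester",
    }

    def with_topology(rule: dict, topology: dict) -> dict:
        """Return a copy of an alert rule with topology labels merged in."""
        labeled = dict(rule)
        labeled["labels"] = {**rule.get("labels", {}), **topology}
        return labeled

    rule = {"alert": "TooManyLogMessages", "labels": {"severity": "Low"}}
    labeled = with_topology(rule, topology)
    # Consumed rules carry exactly these labels; forwarded rules gain juju_charm too.
    assert set(labeled["labels"]) == {"severity", *topology}
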