Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lido-exporter alert rules and alertmanager for monitoring stack #425

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
10 changes: 9 additions & 1 deletion internal/monitoring/services/grafana/config/prom.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,12 @@ datasources:
# Field with internal link pointing to data source in Grafana.
# datasourceUid value can be anything, but it should be unique across all defined data source uids.
- datasourceUid: sedge-prom
name: traceID
name: traceID

- name: Alertmanager
type: alertmanager
access: proxy
url: http://alertmanager:9093
uid: sedge-alertmanager
jsonData:
implementation: "prometheus"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Alertmanager configuration shipped with the monitoring stack.
global:
# Value of Alertmanager's global resolve_timeout setting.
resolve_timeout: 5m

# Single top-level route: every alert is grouped by its alert name and sent to
# the one receiver defined below.
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 2h
receiver: 'blank-receiver'

# The only receiver has a name but no notification integrations configured in
# this file.
receivers:
- name: 'blank-receiver'
10 changes: 10 additions & 0 deletions internal/monitoring/services/prometheus/config/prometheus.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
# Base Prometheus configuration: global intervals, rule files and the
# Alertmanager endpoint. No scrape jobs are declared in this file.
global:
scrape_interval: 15s
evaluation_interval: 10s

# Alerting rule files; the glob is relative, matching the rules directory that
# is mounted next to this config file in the compose service definition.
rule_files:
- ./rules/*.yml

# Alerts are forwarded to the alertmanager service on port 9093.
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
10 changes: 7 additions & 3 deletions internal/monitoring/services/prometheus/dotenv.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ limitations under the License.
package prometheus

// dotEnv lists the environment variable names and values this service
// provides: the Prometheus image, port, config file and rules directory, and
// the Alertmanager image, config file and port. The same names are referenced
// as ${...} placeholders by the prometheus and alertmanager compose services.
//
// NOTE(review): PROM_IMAGE, PROM_PORT and PROM_CONF appear twice below; this
// looks like the before/after lines of a diff rendered without +/- markers.
// A Go map literal may contain each constant key only once — confirm against
// the real file.
var dotEnv map[string]string = map[string]string{
"PROM_IMAGE": "prom/prometheus:v2.54.1",
"PROM_PORT": "9090",
"PROM_CONF": "./prometheus/prometheus.yml",
"PROM_IMAGE": "prom/prometheus:v2.54.1",
"PROM_PORT": "9090",
"PROM_CONF": "./prometheus/prometheus.yml",
"PROM_RULES": "./prometheus/rules",
"ALERTMANAGER_IMAGE": "prom/alertmanager:v0.27.0",
"ALERTMANAGER_CONF": "./prometheus/alertmanager/alertmanager.yml",
"ALERTMANAGER_PORT": "9093",
}
40 changes: 40 additions & 0 deletions internal/monitoring/services/prometheus/rules/lido-exporter.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alerting rules for metrics published by the lido-exporter.
groups:
# Penalty alerts: each rule fires (severity: warning) when the
# csm_onchain_penalties_total series for one penalty_type has increased within
# the last 10 minutes and that condition has held for 1 minute.
# The descriptions interpolate the node_operator_id label, so they assume the
# metric carries that label — TODO confirm against the exporter.
- name: Lido Exporter Penalties
rules:
# penalty_type="initial_slashing"
- alert: InitialSlashingDetected
expr: increase(csm_onchain_penalties_total{penalty_type="initial_slashing"}[10m]) > 0
for: 1m
labels:
severity: warning
annotations:
summary: "Initial slashing detected"
description: "An initial slashing penalty has been detected for node operator {{ $labels.node_operator_id }}."

# penalty_type="withdrawal"
- alert: WithdrawalPenaltyDetected
expr: increase(csm_onchain_penalties_total{penalty_type="withdrawal"}[10m]) > 0
for: 1m
labels:
severity: warning
annotations:
summary: "Withdrawal penalty detected"
description: "A withdrawal penalty (less than 32 ETH) has been detected for node operator {{ $labels.node_operator_id }}."

# penalty_type="el_rewards_stealing"
- alert: ELRewardsStealingPenaltyDetected
expr: increase(csm_onchain_penalties_total{penalty_type="el_rewards_stealing"}[10m]) > 0
for: 1m
labels:
severity: warning
annotations:
summary: "EL rewards stealing penalty detected"
description: "An EL rewards stealing penalty has been detected for node operator {{ $labels.node_operator_id }}. Consider compensating the penalty."

# Exit-request alert: fires (severity: critical) when
# csm_onchain_exit_requests_total is above zero for 1 minute.
# NOTE(review): unlike the penalty rules, this compares the raw series value
# instead of increase(...[10m]). If the metric is a monotonically increasing
# counter, the alert would keep firing after the first exit request ever
# observed — confirm the metric's semantics or whether increase() was intended.
- name: Lido Exporter Exit Requests
rules:
- alert: ExitRequestDetected
expr: csm_onchain_exit_requests_total > 0
for: 1m
labels:
severity: critical
annotations:
summary: "Exit request detected"
description: "An exit request has been detected for node operator {{ $labels.node_operator_id }}. Prompt action is required to exit the requested validators."
khalifaa55 marked this conversation as resolved.
Show resolved Hide resolved
98 changes: 97 additions & 1 deletion internal/monitoring/services/prometheus/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ package prometheus
import (
"embed"
"fmt"
"io"
"io/fs"
"net"
"net/http"
"path/filepath"
Expand All @@ -37,15 +39,34 @@ import (
//go:embed config
var config embed.FS

//go:embed rules
var rules embed.FS

//go:embed alertmanager
var alertmanager embed.FS

// Config represents the Prometheus configuration. Its fields map onto the
// top-level YAML keys global, rule_files, alerting and scrape_configs.
type Config struct {
// Global holds the settings under the "global" key.
Global GlobalConfig `yaml:"global"`
// RuleFiles lists the entries under "rule_files".
RuleFiles []string `yaml:"rule_files"`
// Alerting holds the settings under the "alerting" key.
Alerting AlertingConfig `yaml:"alerting"`
// ScrapeConfigs lists the scrape jobs under "scrape_configs".
ScrapeConfigs []ScrapeConfig `yaml:"scrape_configs"`
}

// AlertingConfig represents the alerting configuration for Prometheus, i.e.
// the content of the "alerting" key of Config.
type AlertingConfig struct {
// Alertmanagers lists the entries under "alertmanagers".
Alertmanagers []AlertmanagerConfig `yaml:"alertmanagers"`
}

// AlertmanagerConfig represents the configuration for an Alertmanager: one
// entry of the "alertmanagers" list.
type AlertmanagerConfig struct {
// StaticConfigs lists the entries under "static_configs".
StaticConfigs []StaticConfig `yaml:"static_configs"`
}

// GlobalConfig represents the global configuration for Prometheus: the
// "scrape_interval" and "evaluation_interval" keys. Both are kept as strings.
//
// NOTE(review): ScrapeInterval appears twice below; this looks like the
// before/after lines of a diff rendered without +/- markers. A struct can
// declare the field only once — confirm against the real file.
type GlobalConfig struct {
ScrapeInterval string `yaml:"scrape_interval"`
ScrapeInterval string `yaml:"scrape_interval"`
EvaluationInterval string `yaml:"evaluation_interval"`
}

// ScrapeConfig represents the configuration for a Prometheus scrape job.
Expand Down Expand Up @@ -262,6 +283,16 @@ func (p *PrometheusService) Setup(options map[string]string) error {
return err
}

// Copy rules
if err = p.copyRules(filepath.Join("prometheus")); err != nil {
return err
}

// Copy alertmanager config
if err = p.copyAlertManagerConfig(filepath.Join("prometheus")); err != nil {
return err
}

return nil
}

Expand Down Expand Up @@ -307,3 +338,68 @@ func (p *PrometheusService) reloadConfig() error {
func (p *PrometheusService) Name() string {
return monitoring.PrometheusServiceName
}

// copyRules copies the embedded "rules" tree into the monitoring stack,
// keeping its layout: every embedded path is joined onto dst, so the files end
// up under <dst>/rules.
//
// It returns the first error met while walking the embedded tree, creating a
// directory, or reading, closing or writing a file.
func (p *PrometheusService) copyRules(dst string) (err error) {
	return p.copyEmbeddedTree(rules, "rules", dst)
}

// copyAlertManagerConfig copies the embedded "alertmanager" tree into the
// monitoring stack, keeping its layout: every embedded path is joined onto
// dst, so the files end up under <dst>/alertmanager.
//
// It returns the first error met while walking the embedded tree, creating a
// directory, or reading, closing or writing a file.
func (p *PrometheusService) copyAlertManagerConfig(dst string) (err error) {
	return p.copyEmbeddedTree(alertmanager, "alertmanager", dst)
}

// copyEmbeddedTree walks src starting at root and mirrors it through p.stack:
// directories are created with CreateDir and regular entries are written with
// WriteFile, each at filepath.Join(dst, <path inside src>).
func (p *PrometheusService) copyEmbeddedTree(src fs.FS, root, dst string) error {
	return fs.WalkDir(src, root, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		target := filepath.Join(dst, path)
		if d.IsDir() {
			if err := p.stack.CreateDir(target); err != nil {
				return err
			}
			return nil
		}
		return p.copyEmbeddedFile(src, path, target)
	})
}

// copyEmbeddedFile reads the file at path inside src and writes its content to
// target through p.stack.
//
// The result is named so the deferred Close can report its error: it is
// returned when nothing else failed first, instead of being assigned to a
// shadowed local and lost.
func (p *PrometheusService) copyEmbeddedFile(src fs.FS, path, target string) (err error) {
	file, err := src.Open(path)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := file.Close(); err == nil {
			err = cerr
		}
	}()

	data, err := io.ReadAll(file)
	if err != nil {
		return err
	}
	if err = p.stack.WriteFile(target, data); err != nil {
		return err
	}
	return nil
}
58 changes: 44 additions & 14 deletions internal/monitoring/services/prometheus/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,13 @@ func TestSetup(t *testing.T) {
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
for i := 0; i < 5; i++ {
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
}
return locker
}
onlyNewLocker := func(t *testing.T) *mocks.MockLocker {
Expand Down Expand Up @@ -303,6 +305,25 @@ func TestSetup(t *testing.T) {
assert.Equal(t, tt.targets[i], prom.ScrapeConfigs[i].JobName)
assert.Equal(t, tt.targets[i], prom.ScrapeConfigs[i].StaticConfigs[0].Targets[0])
}
// Check the rules
foldersToCheck := []string{
filepath.Join(basePath, "monitoring", "prometheus", "rules"),
filepath.Join(basePath, "monitoring", "prometheus", "alertmanager"),
}
filesToCheck := []string{
filepath.Join(basePath, "monitoring", "prometheus", "rules", "lido-exporter.yml"),
filepath.Join(basePath, "monitoring", "prometheus", "alertmanager", "alertmanager.yml"),
}
for _, folder := range foldersToCheck {
ok, err = afero.DirExists(afs, folder)
assert.True(t, ok)
assert.NoError(t, err)
}
for _, file := range filesToCheck {
ok, err = afero.Exists(afs, file)
assert.True(t, ok)
assert.NoError(t, err)
}
}
})
}
Expand All @@ -325,7 +346,7 @@ func TestAddTarget(t *testing.T) {
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
for i := 0; i < times*2+1; i++ {
for i := 0; i < times*2+5; i++ {
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
Expand Down Expand Up @@ -724,7 +745,7 @@ func TestRemoveTarget(t *testing.T) {
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
for i := 0; i < times*2+1; i++ {
for i := 0; i < times*2+5; i++ {
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
Expand Down Expand Up @@ -851,7 +872,7 @@ func TestRemoveTarget(t *testing.T) {
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
for i := 0; i < times+1; i++ {
for i := 0; i < times+5; i++ {
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
Expand Down Expand Up @@ -915,10 +936,15 @@ func TestRemoveTarget(t *testing.T) {
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
for i := 0; i < 5; i++ {
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
}

locker.EXPECT().Lock().Return(fmt.Errorf("error"))
return locker
},
Expand Down Expand Up @@ -947,10 +973,14 @@ func TestRemoveTarget(t *testing.T) {
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
for i := 0; i < 5; i++ {
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(true),
locker.EXPECT().Unlock().Return(nil),
)
}
gomock.InOrder(
locker.EXPECT().Lock().Return(nil),
locker.EXPECT().Locked().Return(false),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ services:
- ${GRAFANA_DATA}:/etc/grafana/data
environment:
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
- GF_INSTALL_PLUGINS=grafana-oncall-app
networks:
- sedge

Expand All @@ -25,13 +26,27 @@ services:
- ${PROM_PORT}:9090
volumes:
- ${PROM_CONF}:/etc/prometheus/prometheus.yml
- ${PROM_RULES}:/etc/prometheus/rules
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.enable-lifecycle'
networks:
- sedge

alertmanager:
container_name: sedge_alertmanager
image: ${ALERTMANAGER_IMAGE}
restart: unless-stopped
ports:
- ${ALERTMANAGER_PORT}:9093
volumes:
- ${ALERTMANAGER_CONF}:/etc/prometheus/alertmanager/alertmanager.yml
command:
- '--config.file=/etc/prometheus/alertmanager/alertmanager.yml'
networks:
- sedge

node-exporter:
container_name: sedge_node_exporter
image: ${NODE_EXPORTER_IMAGE}
Expand Down
Loading