Skip to content

Commit

Permalink
Merge pull request #64 from blockpane/develop
Browse files Browse the repository at this point in the history
2.2.1
  • Loading branch information
blockpane committed May 12, 2023
2 parents 12d7947 + 20efba3 commit dc491c9
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 39 deletions.
9 changes: 6 additions & 3 deletions example-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ listen_port: 8888
hide_logs: no
# How long to wait before alerting that a node is down.
node_down_alert_minutes: 3
# PagerDuty severity to use for the node-down alert.
node_down_alert_severity: critical

# Should the prometheus exporter be enabled?
prometheus_enabled: yes
Expand Down Expand Up @@ -57,7 +59,8 @@ chains:
# chain_id is validated for a match when connecting to an RPC endpoint, also used as a label in several places.
chain_id: osmosis-1
# Hooray, in v2 we derive the valcons from abci queries so you don't have to jump through hoops to figure out how
# to convert ed25519 keys to the appropriate bech32 address
# to convert ed25519 keys to the appropriate bech32 address.
# If the chain uses ICS, supply the valcons address here instead of the valoper address.
valoper_address: osmovaloper1xxxxxxx...
# Should the monitor revert to using public API endpoints if all supplied RPC nodes fail?
# This isn't always reliable, not all public nodes have websocket proxying setup correctly.
Expand All @@ -74,15 +77,15 @@ chains:
consecutive_enabled: yes
# How many missed blocks should trigger a notification?
consecutive_missed: 5
# NOT USED: future hint for pagerduty's routing
# PagerDuty severity to use for the consecutive-missed-blocks alert.
consecutive_priority: critical

# For each chain there is a specific window of blocks and a percentage of missed blocks that will result in
# a downtime jail infraction. Should an alert be sent if a certain percentage of this window is exceeded?
percentage_enabled: no
# What percentage should trigger the alert
percentage_missed: 10
# Not used yet, pagerduty routing hint
# PagerDuty severity to use for the percentage-missed alert.
percentage_priority: warning

# Should an alert be sent if the validator is not in the active set ie, jailed,
Expand Down
1 change: 1 addition & 0 deletions example-docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ services:
volumes:
- home:/var/lib/tenderduty
- ./config.yml:/var/lib/tenderduty/config.yml
- ./chains.d:/var/lib/tenderduty/chains.d/
logging:
driver: "json-file"
options:
Expand Down
31 changes: 16 additions & 15 deletions td2/alert.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/PagerDuty/go-pagerduty"
tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
"log"
"net/http"
"strings"
"sync"
"time"

"github.com/PagerDuty/go-pagerduty"
tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
)

type alertMsg struct {
Expand All @@ -34,8 +35,8 @@ type alertMsg struct {
discHook string
discMentions string

slkHook string
slkMentions string
slkHook string
slkMentions string
}

type notifyDest uint8
Expand Down Expand Up @@ -206,9 +207,9 @@ func buildSlackMessage(msg *alertMsg) *SlackMessage {
return &SlackMessage{
Text: msg.message,
Attachments: []Attachment{
Attachment{
Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions),
Color: color,
{
Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions),
Color: color,
},
},
}
Expand Down Expand Up @@ -481,7 +482,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("stalled: have not seen a new block on %s in %d minutes", cc.ChainId, cc.Alerts.Stalled),
"critical",
"info",
true,
&cc.valInfo.Valcons,
)
Expand Down Expand Up @@ -525,7 +526,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId),
"critical",
cc.Alerts.ConsecutivePriority,
false,
&id,
)
Expand All @@ -537,7 +538,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId),
"critical",
"info",
true,
&id,
)
Expand All @@ -552,7 +553,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId),
"critical",
cc.Alerts.PercentagePriority,
false,
&id,
)
Expand All @@ -564,7 +565,7 @@ func (cc *ChainConfig) watch() {
td.alert(
cc.name,
fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId),
"critical",
"info",
false,
&id,
)
Expand All @@ -585,8 +586,8 @@ func (cc *ChainConfig) watch() {
nodeAlarms[node.Url] = true // used to keep active alert count correct
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
"critical",
fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId),
td.NodeDownSeverity,
false,
&node.Url,
)
Expand All @@ -596,7 +597,7 @@ func (cc *ChainConfig) watch() {
node.wasDown = false
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId),
"info",
true,
&node.Url,
Expand Down
2 changes: 2 additions & 0 deletions td2/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ type Config struct {
// NodeDownMin controls how long we wait before sending an alert that a node is not responding or has
// fallen behind.
NodeDownMin int `yaml:"node_down_alert_minutes"`
// NodeDownSeverity controls the Pagerduty severity when notifying if a node is down.
NodeDownSeverity string `yaml:"node_down_alert_severity"`

// Prom controls if the prometheus exporter is enabled.
Prom bool `yaml:"prometheus_enabled"`
Expand Down
65 changes: 44 additions & 21 deletions td2/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ package tenderduty

import (
"context"
"encoding/hex"
"errors"
"fmt"
"strings"
"time"

"github.com/cosmos/cosmos-sdk/crypto/keys/ed25519"
"github.com/cosmos/cosmos-sdk/crypto/keys/secp256k1"
"github.com/cosmos/cosmos-sdk/types/bech32"
slashing "github.com/cosmos/cosmos-sdk/x/slashing/types"
staking "github.com/cosmos/cosmos-sdk/x/staking/types"
rpchttp "github.com/tendermint/tendermint/rpc/client/http"
"strings"
"time"
)

// ValInfo holds most of the stats/info used for secondary alarms. It is refreshed roughly every minute.
Expand Down Expand Up @@ -51,30 +53,36 @@ func (cc *ChainConfig) GetValInfo(first bool) (err error) {
l(fmt.Sprintf("❌ %s (%s) is INACTIVE", cc.ValAddress, cc.valInfo.Moniker))
}

// need to know the prefix for when we serialize the slashing info query, this is too fragile.
// for now, we perform specific chain overrides based on known values because the valoper is used
// in so many places.
var prefix string
split := strings.Split(cc.ValAddress, "valoper")
if len(split) != 2 {
if pre, ok := altValopers.getAltPrefix(cc.ValAddress); ok {
cc.valInfo.Valcons, err = bech32.ConvertAndEncode(pre, cc.valInfo.Conspub[:20])
if err != nil {
if strings.Contains(cc.ValAddress, "valcons") {
// no need to change prefix for signing info query
cc.valInfo.Valcons = cc.ValAddress
} else {
// need to know the prefix for when we serialize the slashing info query, this is too fragile.
// for now, we perform specific chain overrides based on known values because the valoper is used
// in so many places.
var prefix string
split := strings.Split(cc.ValAddress, "valoper")
if len(split) != 2 {
if pre, ok := altValopers.getAltPrefix(cc.ValAddress); ok {
cc.valInfo.Valcons, err = bech32.ConvertAndEncode(pre, cc.valInfo.Conspub[:20])
if err != nil {
return
}
} else {
err = errors.New("❓ could not determine bech32 prefix from valoper address: " + cc.ValAddress)
return
}
} else {
err = errors.New("❓ could not determine bech32 prefix from valoper address: " + cc.ValAddress)
return
prefix = split[0] + "valcons"
cc.valInfo.Valcons, err = bech32.ConvertAndEncode(prefix, cc.valInfo.Conspub[:20])
if err != nil {
return
}
}
} else {
prefix = split[0] + "valcons"
cc.valInfo.Valcons, err = bech32.ConvertAndEncode(prefix, cc.valInfo.Conspub[:20])
if err != nil {
return
if first {
l("⚙️", cc.ValAddress[:20], "... is using consensus key:", cc.valInfo.Valcons)
}
}
if first {
l("⚙️", cc.ValAddress[:20], "... is using consensus key:", cc.valInfo.Valcons)

}

// get current signing information (tombstoned, missed block count)
Expand Down Expand Up @@ -133,6 +141,16 @@ func (cc *ChainConfig) GetValInfo(first bool) (err error) {

// getVal returns the public key, moniker, and if the validator is jailed.
func getVal(ctx context.Context, client *rpchttp.HTTP, valoper string) (pub []byte, moniker string, jailed, bonded bool, err error) {
if strings.Contains(valoper, "valcons") {
_, bz, err := bech32.DecodeAndConvert(valoper)
if err != nil {
return nil, "", false, false, errors.New("could not decode and convert your address" + valoper)
}

hexAddress := fmt.Sprintf("%X", bz)
return ToBytes(hexAddress), valoper, false, true, nil
}

q := staking.QueryValidatorRequest{
ValidatorAddr: valoper,
}
Expand Down Expand Up @@ -179,3 +197,8 @@ func getVal(ctx context.Context, client *rpchttp.HTTP, valoper string) (pub []by

return pubBytes, val.Validator.GetMoniker(), val.Validator.Jailed, val.Validator.Status == 3, nil
}

// ToBytes decodes a hex-encoded string (upper, lower, or mixed case) into its
// raw bytes. It returns nil when address is not valid hex (odd length or a
// non-hex character) instead of a silently truncated prefix.
func ToBytes(address string) []byte {
	// hex.DecodeString accepts both upper- and lower-case digits, so no case
	// normalization is needed before decoding.
	bz, err := hex.DecodeString(address)
	if err != nil {
		// On error DecodeString still returns the bytes decoded so far; handing
		// back that partial slice would look like a valid (but wrong) address.
		return nil
	}
	return bz
}

0 comments on commit dc491c9

Please sign in to comment.