diff --git a/example-config.yml b/example-config.yml index 7845a55..21d6e77 100644 --- a/example-config.yml +++ b/example-config.yml @@ -10,6 +10,8 @@ listen_port: 8888 hide_logs: no # How long to wait before alerting that a node is down. node_down_alert_minutes: 3 +# Node Down alert Pagerduty Severity +node_down_alert_severity: critical # Should the prometheus exporter be enabled? prometheus_enabled: yes @@ -57,7 +59,8 @@ chains: # chain_id is validated for a match when connecting to an RPC endpoint, also used as a label in several places. chain_id: osmosis-1 # Hooray, in v2 we derive the valcons from abci queries so you don't have to jump through hoops to figure out how - # to convert ed25519 keys to the appropriate bech32 address + # to convert ed25519 keys to the appropriate bech32 address. + # Use valcons address if using ICS valoper_address: osmovaloper1xxxxxxx... # Should the monitor revert to using public API endpoints if all supplied RCP nodes fail? # This isn't always reliable, not all public nodes have websocket proxying setup correctly. @@ -74,7 +77,7 @@ chains: consecutive_enabled: yes # How many missed blocks should trigger a notification? 
consecutive_missed: 5 - # NOT USED: future hint for pagerduty's routing + # Consecutive Missed alert Pagerduty Severity consecutive_priority: critical # For each chain there is a specific window of blocks and a percentage of missed blocks that will result in @@ -82,7 +85,7 @@ chains: percentage_enabled: no # What percentage should trigger the alert percentage_missed: 10 - # Not used yet, pagerduty routing hint + # Percentage Missed alert Pagerduty Severity percentage_priority: warning # Should an alert be sent if the validator is not in the active set ie, jailed, diff --git a/example-docker-compose.yml b/example-docker-compose.yml index a3f4f38..4238699 100644 --- a/example-docker-compose.yml +++ b/example-docker-compose.yml @@ -19,6 +19,7 @@ services: volumes: - home:/var/lib/tenderduty - ./config.yml:/var/lib/tenderduty/config.yml + - ./chains.d:/var/lib/tenderduty/chains.d/ logging: driver: "json-file" options: diff --git a/td2/alert.go b/td2/alert.go index 435aff4..0227fc7 100644 --- a/td2/alert.go +++ b/td2/alert.go @@ -5,13 +5,14 @@ import ( "context" "encoding/json" "fmt" - "github.com/PagerDuty/go-pagerduty" - tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5" "log" "net/http" "strings" "sync" "time" + + "github.com/PagerDuty/go-pagerduty" + tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5" ) type alertMsg struct { @@ -34,8 +35,8 @@ type alertMsg struct { discHook string discMentions string - slkHook string - slkMentions string + slkHook string + slkMentions string } type notifyDest uint8 @@ -206,9 +207,9 @@ func buildSlackMessage(msg *alertMsg) *SlackMessage { return &SlackMessage{ Text: msg.message, Attachments: []Attachment{ - Attachment{ - Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions), - Color: color, + { + Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions), + Color: color, }, }, } @@ -481,7 +482,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, 
fmt.Sprintf("stalled: have not seen a new block on %s in %d minutes", cc.ChainId, cc.Alerts.Stalled), - "critical", + "info", true, &cc.valInfo.Valcons, ) @@ -525,7 +526,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId), - "critical", + cc.Alerts.ConsecutivePriority, false, &id, ) @@ -537,7 +538,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId), - "critical", + "info", true, &id, ) @@ -552,7 +553,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId), - "critical", + cc.Alerts.PercentagePriority, false, &id, ) @@ -564,7 +565,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId), - "critical", + "info", false, &id, ) @@ -585,8 +586,8 @@ func (cc *ChainConfig) watch() { nodeAlarms[node.Url] = true // used to keep active alert count correct td.alert( cc.name, - fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId), - "critical", + fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId), + td.NodeDownSeverity, false, &node.Url, ) @@ -596,7 +597,7 @@ func (cc *ChainConfig) watch() { node.wasDown = false td.alert( cc.name, - fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId), + fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId), "info", true, &node.Url, diff --git a/td2/types.go b/td2/types.go index 0e98275..de72abb 100644 --- 
a/td2/types.go +++ b/td2/types.go @@ -48,6 +48,8 @@ type Config struct { // NodeDownMin controls how long we wait before sending an alert that a node is not responding or has // fallen behind. NodeDownMin int `yaml:"node_down_alert_minutes"` + // NodeDownSeverity controls the Pagerduty severity when notifying if a node is down. + NodeDownSeverity string `yaml:"node_down_alert_severity"` // Prom controls if the prometheus exporter is enabled. Prom bool `yaml:"prometheus_enabled"` diff --git a/td2/validator.go b/td2/validator.go index 1c5e294..ac01ce7 100644 --- a/td2/validator.go +++ b/td2/validator.go @@ -2,16 +2,18 @@ package tenderduty import ( "context" + "encoding/hex" "errors" "fmt" + "strings" + "time" + "github.com/cosmos/cosmos-sdk/crypto/keys/ed25519" "github.com/cosmos/cosmos-sdk/crypto/keys/secp256k1" "github.com/cosmos/cosmos-sdk/types/bech32" slashing "github.com/cosmos/cosmos-sdk/x/slashing/types" staking "github.com/cosmos/cosmos-sdk/x/staking/types" rpchttp "github.com/tendermint/tendermint/rpc/client/http" - "strings" - "time" ) // ValInfo holds most of the stats/info used for secondary alarms. It is refreshed roughly every minute. @@ -51,30 +53,36 @@ func (cc *ChainConfig) GetValInfo(first bool) (err error) { l(fmt.Sprintf("❌ %s (%s) is INACTIVE", cc.ValAddress, cc.valInfo.Moniker)) } - // need to know the prefix for when we serialize the slashing info query, this is too fragile. - // for now, we perform specific chain overrides based on known values because the valoper is used - // in so many places. 
- var prefix string - split := strings.Split(cc.ValAddress, "valoper") - if len(split) != 2 { - if pre, ok := altValopers.getAltPrefix(cc.ValAddress); ok { - cc.valInfo.Valcons, err = bech32.ConvertAndEncode(pre, cc.valInfo.Conspub[:20]) - if err != nil { + if strings.Contains(cc.ValAddress, "valcons") { + // no need to change prefix for signing info query + cc.valInfo.Valcons = cc.ValAddress + } else { + // need to know the prefix for when we serialize the slashing info query, this is too fragile. + // for now, we perform specific chain overrides based on known values because the valoper is used + // in so many places. + var prefix string + split := strings.Split(cc.ValAddress, "valoper") + if len(split) != 2 { + if pre, ok := altValopers.getAltPrefix(cc.ValAddress); ok { + cc.valInfo.Valcons, err = bech32.ConvertAndEncode(pre, cc.valInfo.Conspub[:20]) + if err != nil { + return + } + } else { + err = errors.New("❓ could not determine bech32 prefix from valoper address: " + cc.ValAddress) return } } else { - err = errors.New("❓ could not determine bech32 prefix from valoper address: " + cc.ValAddress) - return + prefix = split[0] + "valcons" + cc.valInfo.Valcons, err = bech32.ConvertAndEncode(prefix, cc.valInfo.Conspub[:20]) + if err != nil { + return + } } - } else { - prefix = split[0] + "valcons" - cc.valInfo.Valcons, err = bech32.ConvertAndEncode(prefix, cc.valInfo.Conspub[:20]) - if err != nil { - return + if first { + l("⚙️", cc.ValAddress[:20], "... is using consensus key:", cc.valInfo.Valcons) } - } - if first { - l("⚙️", cc.ValAddress[:20], "... is using consensus key:", cc.valInfo.Valcons) + } // get current signing information (tombstoned, missed block count) @@ -133,6 +141,16 @@ func (cc *ChainConfig) GetValInfo(first bool) (err error) { // getVal returns the public key, moniker, and if the validator is jailed. 
func getVal(ctx context.Context, client *rpchttp.HTTP, valoper string) (pub []byte, moniker string, jailed, bonded bool, err error) { + if strings.Contains(valoper, "valcons") { + _, bz, err := bech32.DecodeAndConvert(valoper) + if err != nil { + return nil, "", false, false, fmt.Errorf("could not decode and convert your address %q: %w", valoper, err) + } + + // bz already holds the decoded address bytes; no staking query is made in this branch, so the address itself is returned as the moniker and the validator is reported as not jailed and bonded. + return bz, valoper, false, true, nil + } + q := staking.QueryValidatorRequest{ ValidatorAddr: valoper, } @@ -179,3 +197,9 @@ func getVal(ctx context.Context, client *rpchttp.HTTP, valoper string) (pub []by return pubBytes, val.Validator.GetMoniker(), val.Validator.Jailed, val.Validator.Status == 3, nil } + +// ToBytes decodes a hex-encoded address (upper or lower case) into raw bytes. The decode error is deliberately not reported: on malformed input the bytes decoded before the first invalid character are returned. +func ToBytes(address string) []byte { + bz, _ := hex.DecodeString(address) + return bz +}