diff --git a/Dockerfile b/Dockerfile index 380f7c7..af37ba8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # 1st stage, build app -FROM golang:1.19 as builder +FROM golang:1.20.6-bullseye as builder RUN apt-get update && apt-get -y upgrade && apt-get install -y upx COPY . /build/app WORKDIR /build/app diff --git a/docs/README.md b/docs/README.md index 812f174..f5b0872 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,7 +10,7 @@ This is a tool for validators running tendermint nodes. It sends notifications w - [Configuration File Settings](config.md) - [Setting up PagerDuty](pagerduty.md) - [Setting up Discord](discord.md) -- TODO: [Setting up Telegram](telegram.md) +- [Setting up Telegram](telegram.md) - [Prometheus Exports](prometheus.md) - [Remotely Configuring Tenderduty](remote.md) - [Running on Akash](akash.md) @@ -79,4 +79,4 @@ Notifications: * There really isn't anything special about the notifications it sends. For Discord and Telegram it will only send an alert on a new alarm and when the alarm clears. Pagerduty has a little more nuance. * Pagerduty: * Pro-tip: the alarms sent to pagerduty all use a unique "key". Pagerduty will automatically de-deduplicate alerts based on this key. If you want redundant monitoring you can run multiple instances of tenderduty alerting to pagerduty and will not get duplicate alerts. - * Additional flapping detection is applied to pagerduty (not to discord or telegram). If a node is going up and down every few minutes it will only send an alert once in a five minute period. \ No newline at end of file + * Additional flapping detection is applied to pagerduty (not to discord or telegram). If a node is going up and down every few minutes it will only send an alert once in a five minute period. 
diff --git a/docs/install.md b/docs/install.md index 52ae625..91dd531 100644 --- a/docs/install.md +++ b/docs/install.md @@ -36,6 +36,7 @@ services: volumes: - home:/var/lib/tenderduty - ./config.yml:/var/lib/tenderduty/config.yml + - ./chains.d:/var/lib/tenderduty/chains.d/ logging: driver: "json-file" options: @@ -159,4 +160,4 @@ sudo systemctl start tenderduty # and to watch the logs, press CTRL-C to stop watching sudo journalctl -fu tenderduty -``` \ No newline at end of file +``` diff --git a/example-config.yml b/example-config.yml index c261f6f..b0b9a1c 100644 --- a/example-config.yml +++ b/example-config.yml @@ -67,7 +67,7 @@ chains: chain_id: osmosis-1 # Hooray, in v2 we derive the valcons from abci queries so you don't have to jump through hoops to figure out how # to convert ed25519 keys to the appropriate bech32 address. - # Use valcons address if using ICS + # Use valcons address if using ICS or tendermint/PubKeyBn254 valoper_address: osmovaloper1xxxxxxx... # Should the monitor revert to using public API endpoints if all supplied RCP nodes fail? # This isn't always reliable, not all public nodes have websocket proxying setup correctly. diff --git a/example-docker-compose.yml b/example-docker-compose.yml index 4238699..2d3ffb6 100644 --- a/example-docker-compose.yml +++ b/example-docker-compose.yml @@ -11,6 +11,7 @@ volumes: services: tenderduty: + container_name: tenderduty build: . 
command: "" expose: @@ -30,7 +31,8 @@ services: - monitor-net caddy: - image: caddy:2.3.0 + container_name: caddy + image: caddy:2.7.6 ports: - "80:80" - "443:443" diff --git a/td2/dashboard/server.go b/td2/dashboard/server.go index 9f0eba5..80cc8b0 100644 --- a/td2/dashboard/server.go +++ b/td2/dashboard/server.go @@ -3,8 +3,6 @@ package dash import ( "embed" "encoding/json" - "github.com/gorilla/websocket" - "github.com/textileio/go-threads/broadcast" "io/fs" "log" "net/http" @@ -12,6 +10,9 @@ import ( "sort" "sync" "time" + + "github.com/gorilla/websocket" + "github.com/textileio/go-threads/broadcast" ) var ( diff --git a/td2/dashboard/types.go b/td2/dashboard/types.go index 7e98cb4..cb96b03 100644 --- a/td2/dashboard/types.go +++ b/td2/dashboard/types.go @@ -1,20 +1,21 @@ package dash type ChainStatus struct { - MsgType string `json:"msgType"` - Name string `json:"name"` - ChainId string `json:"chain_id"` - Moniker string `json:"moniker"` - Bonded bool `json:"bonded"` - Jailed bool `json:"jailed"` - Tombstoned bool `json:"tombstoned"` - Missed int64 `json:"missed"` - Window int64 `json:"window"` - Nodes int `json:"nodes"` - HealthyNodes int `json:"healthy_nodes"` - ActiveAlerts int `json:"active_alerts"` - Height int64 `json:"height"` - LastError string `json:"last_error"` + MsgType string `json:"msgType"` + Name string `json:"name"` + ChainId string `json:"chain_id"` + Moniker string `json:"moniker"` + Bonded bool `json:"bonded"` + Jailed bool `json:"jailed"` + Tombstoned bool `json:"tombstoned"` + Missed int64 `json:"missed"` + Window int64 `json:"window"` + MinSignedPerWindow float64 `json:"min_signed_per_window"` + Nodes int `json:"nodes"` + HealthyNodes int `json:"healthy_nodes"` + ActiveAlerts int `json:"active_alerts"` + Height int64 `json:"height"` + LastError string `json:"last_error"` Blocks []int `json:"blocks"` } diff --git a/td2/rpc.go b/td2/rpc.go index b3429fe..f25ac9a 100644 --- a/td2/rpc.go +++ b/td2/rpc.go @@ -2,6 +2,7 @@ package tenderduty 
import ( "context" + "encoding/json" "errors" "fmt" "io" @@ -23,6 +24,7 @@ func (cc *ChainConfig) newRpc() error { for _, endpoint := range cc.Nodes { anyWorking = anyWorking || !endpoint.down } + // grab the first working endpoint tryUrl := func(u string) (msg string, down, syncing bool) { _, err := url.Parse(u) @@ -39,20 +41,28 @@ func (cc *ChainConfig) newRpc() error { down = true return } + var network string + var catching_up bool status, err := cc.client.Status(ctx) if err != nil { - msg = fmt.Sprintf("❌ could not get status for %s: (%s) %s", cc.name, u, err) - down = true - l(msg) - return + n, c, err := getStatusWithEndpoint(ctx, u) + if err != nil { + msg = fmt.Sprintf("❌ could not get status for %s: (%s) %s", cc.name, u, err) + down = true + l(msg) + return + } + network, catching_up = n, c + } else { + network, catching_up = status.NodeInfo.Network, status.SyncInfo.CatchingUp } - if status.NodeInfo.Network != cc.ChainId { - msg = fmt.Sprintf("chain id %s on %s does not match, expected %s, skipping", status.NodeInfo.Network, u, cc.ChainId) + if network != cc.ChainId { + msg = fmt.Sprintf("chain id %s on %s does not match, expected %s, skipping", network, u, cc.ChainId) down = true l(msg) return } - if status.SyncInfo.CatchingUp { + if catching_up { msg = fmt.Sprint("🐢 node is not synced, skipping ", u) syncing = true down = true @@ -97,21 +107,22 @@ func (cc *ChainConfig) newRpc() error { cc.lastError = "no usable RPC endpoints available for " + cc.ChainId if td.EnableDash { td.updateChan <- &dash.ChainStatus{ - MsgType: "status", - Name: cc.name, - ChainId: cc.ChainId, - Moniker: cc.valInfo.Moniker, - Bonded: cc.valInfo.Bonded, - Jailed: cc.valInfo.Jailed, - Tombstoned: cc.valInfo.Tombstoned, - Missed: cc.valInfo.Missed, - Window: cc.valInfo.Window, - Nodes: len(cc.Nodes), - HealthyNodes: 0, - ActiveAlerts: 1, - Height: 0, - LastError: cc.lastError, - Blocks: cc.blocksResults, + MsgType: "status", + Name: cc.name, + ChainId: cc.ChainId, + Moniker: 
cc.valInfo.Moniker, + Bonded: cc.valInfo.Bonded, + Jailed: cc.valInfo.Jailed, + Tombstoned: cc.valInfo.Tombstoned, + Missed: cc.valInfo.Missed, + Window: cc.valInfo.Window, + MinSignedPerWindow: cc.minSignedPerWindow, + Nodes: len(cc.Nodes), + HealthyNodes: 0, + ActiveAlerts: 1, + Height: 0, + LastError: cc.lastError, + Blocks: cc.blocksResults, } } return errors.New("no usable endpoints available for " + cc.ChainId) @@ -261,3 +272,51 @@ func guessPublicEndpoint(u string) string { } return proto + matches[1] + port } + +func getStatusWithEndpoint(ctx context.Context, u string) (string, bool, error) { + // Parse the URL + parsedURL, err := url.Parse(u) + if err != nil { + return "", false, err + } + + // Check if the scheme is 'tcp' and modify to 'http' + if parsedURL.Scheme == "tcp" { + parsedURL.Scheme = "http" + } + + queryPath := fmt.Sprintf("%s/status", parsedURL.String()) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, queryPath, nil) + if err != nil { + return "", false, err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", false, err + } + defer resp.Body.Close() + + b, err := io.ReadAll(resp.Body) + if err != nil { + return "", false, err + } + + type tendermintStatus struct { + JsonRPC string `json:"jsonrpc"` + ID int `json:"id"` + Result struct { + NodeInfo struct { + Network string `json:"network"` + } `json:"node_info"` + SyncInfo struct { + CatchingUp bool `json:"catching_up"` + } `json:"sync_info"` + } `json:"result"` + } + var status tendermintStatus + if err := json.Unmarshal(b, &status); err != nil { + return "", false, err + } + return status.Result.NodeInfo.Network, status.Result.SyncInfo.CatchingUp, nil +} diff --git a/td2/run.go b/td2/run.go index 0cce9fa..c98c6f5 100644 --- a/td2/run.go +++ b/td2/run.go @@ -107,6 +107,12 @@ func Run(configFile, stateFile, chainConfigDirectory string, password *string) e time.Sleep(5 * time.Second) continue } + + e = cc.GetMinSignedPerWindow() + if e != nil { + l("🛑", 
cc.ChainId, e) + } + e = cc.GetValInfo(true) if e != nil { l("🛑", cc.ChainId, e) diff --git a/td2/static/index.html b/td2/static/index.html index 7b2425a..fdffb8c 100644 --- a/td2/static/index.html +++ b/td2/static/index.html @@ -10,6 +10,7 @@ + Tenderduty Dashboard @@ -41,6 +42,7 @@ Moniker Bonded Uptime + Threshold RPC Nodes diff --git a/td2/static/status.js b/td2/static/status.js index e18ef48..5d6c750 100644 --- a/td2/static/status.js +++ b/td2/static/status.js @@ -107,6 +107,9 @@ function updateTable(status) { window += `${(100 - (status.Status[i].missed / status.Status[i].window) * 100).toFixed(2)}%` } window += `
${_.escape(status.Status[i].missed)} / ${_.escape(status.Status[i].window)}
` + + let threshold = "" + threshold += `${100 * status.Status[i].min_signed_per_window}%`; let nodes = `${_.escape(status.Status[i].healthy_nodes)} / ${_.escape(status.Status[i].nodes)}` if (status.Status[i].healthy_nodes < status.Status[i].nodes) { @@ -131,7 +134,8 @@ function updateTable(status) { } r.insertCell(4).innerHTML = `
${bonded}
` r.insertCell(5).innerHTML = `
${window}
` - r.insertCell(6).innerHTML = `
${nodes}
` + r.insertCell(6).innerHTML = `
${threshold}
` + r.insertCell(7).innerHTML = `
${nodes}
` } } diff --git a/td2/types.go b/td2/types.go index 8012949..a36840e 100644 --- a/td2/types.go +++ b/td2/types.go @@ -83,18 +83,19 @@ type savedState struct { // ChainConfig represents a validator to be monitored on a chain, it is somewhat of a misnomer since multiple // validators can be monitored on a single chain. type ChainConfig struct { - name string - wsclient *TmConn // custom websocket client to work around wss:// bugs in tendermint - client *rpchttp.HTTP // legit tendermint client - noNodes bool // tracks if all nodes are down - valInfo *ValInfo // recent validator state, only refreshed every few minutes - lastValInfo *ValInfo // use for detecting newly-jailed/tombstone - blocksResults []int - lastError string - lastBlockTime time.Time - lastBlockAlarm bool - lastBlockNum int64 - activeAlerts int + name string + wsclient *TmConn // custom websocket client to work around wss:// bugs in tendermint + client *rpchttp.HTTP // legit tendermint client + noNodes bool // tracks if all nodes are down + valInfo *ValInfo // recent validator state, only refreshed every few minutes + lastValInfo *ValInfo // use for detecting newly-jailed/tombstone + minSignedPerWindow float64 // instantly see the validator risk level + blocksResults []int + lastError string + lastBlockTime time.Time + lastBlockAlarm bool + lastBlockNum int64 + activeAlerts int statTotalSigns float64 statTotalProps float64 @@ -326,19 +327,20 @@ func validateConfig(c *Config) (fatal bool, problems []string) { } if td.EnableDash { td.updateChan <- &dash.ChainStatus{ - MsgType: "status", - Name: v.name, - ChainId: v.ChainId, - Moniker: v.valInfo.Moniker, - Bonded: v.valInfo.Bonded, - Jailed: v.valInfo.Jailed, - Tombstoned: v.valInfo.Tombstoned, - Missed: v.valInfo.Missed, - Window: v.valInfo.Window, - Nodes: len(v.Nodes), - HealthyNodes: 0, - ActiveAlerts: 0, - Blocks: v.blocksResults, + MsgType: "status", + Name: v.name, + ChainId: v.ChainId, + Moniker: v.valInfo.Moniker, + Bonded: v.valInfo.Bonded, + 
Jailed: v.valInfo.Jailed, + Tombstoned: v.valInfo.Tombstoned, + Missed: v.valInfo.Missed, + MinSignedPerWindow: v.minSignedPerWindow, + Window: v.valInfo.Window, + Nodes: len(v.Nodes), + HealthyNodes: 0, + ActiveAlerts: 0, + Blocks: v.blocksResults, } } } diff --git a/td2/validator.go b/td2/validator.go index ac01ce7..ad00588 100644 --- a/td2/validator.go +++ b/td2/validator.go @@ -28,6 +28,37 @@ type ValInfo struct { Valcons string `json:"valcons"` } +// GetMinSignedPerWindow queries the chain's slashing params and stores the minimum signed-per-window threshold on the ChainConfig. +func (cc *ChainConfig) GetMinSignedPerWindow() (err error) { + if cc.client == nil { + return errors.New("nil rpc client") + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + qParams := &slashing.QueryParamsRequest{} + b, err := qParams.Marshal() + if err != nil { + return + } + resp, err := cc.client.ABCIQuery(ctx, "/cosmos.slashing.v1beta1.Query/Params", b) + if err != nil { + return + } + if resp.Response.Value == nil { + err = errors.New("🛑 could not query slashing params, got empty response") + return + } + params := &slashing.QueryParamsResponse{} + err = params.Unmarshal(resp.Response.Value) + if err != nil { + return + } + + cc.minSignedPerWindow = params.Params.MinSignedPerWindow.MustFloat64() + return +} + // GetValInfo the first bool is used to determine if extra information about the validator should be printed. 
func (cc *ChainConfig) GetValInfo(first bool) (err error) { if cc.client == nil { diff --git a/td2/ws.go b/td2/ws.go index ce1ef4d..f3d83e8 100644 --- a/td2/ws.go +++ b/td2/ws.go @@ -6,14 +6,15 @@ import ( "encoding/json" "errors" "fmt" - dash "github.com/blockpane/tenderduty/v2/td2/dashboard" - "github.com/gorilla/websocket" - pbtypes "github.com/tendermint/tendermint/proto/tendermint/types" "log" "net/url" "strconv" "strings" "time" + + dash "github.com/blockpane/tenderduty/v2/td2/dashboard" + "github.com/gorilla/websocket" + pbtypes "github.com/tendermint/tendermint/proto/tendermint/types" ) const ( @@ -129,6 +130,7 @@ func (cc *ChainConfig) WsRun() { cc.lastError = time.Now().UTC().String() + " " + info l(warn) } + switch signState { case Statusmissed: cc.statTotalMiss += 1 @@ -164,24 +166,26 @@ func (cc *ChainConfig) WsRun() { case cc.valInfo.Jailed: info += "- validator is jailed\n" } + cc.activeAlerts = alarms.getCount(cc.name) if td.EnableDash { td.updateChan <- &dash.ChainStatus{ - MsgType: "status", - Name: cc.name, - ChainId: cc.ChainId, - Moniker: cc.valInfo.Moniker, - Bonded: cc.valInfo.Bonded, - Jailed: cc.valInfo.Jailed, - Tombstoned: cc.valInfo.Tombstoned, - Missed: cc.valInfo.Missed, - Window: cc.valInfo.Window, - Nodes: len(cc.Nodes), - HealthyNodes: healthyNodes, - ActiveAlerts: cc.activeAlerts, - Height: update.Height, - LastError: info, - Blocks: cc.blocksResults, + MsgType: "status", + Name: cc.name, + ChainId: cc.ChainId, + Moniker: cc.valInfo.Moniker, + Bonded: cc.valInfo.Bonded, + Jailed: cc.valInfo.Jailed, + Tombstoned: cc.valInfo.Tombstoned, + Missed: cc.valInfo.Missed, + Window: cc.valInfo.Window, + MinSignedPerWindow: cc.minSignedPerWindow, + Nodes: len(cc.Nodes), + HealthyNodes: healthyNodes, + ActiveAlerts: cc.activeAlerts, + Height: update.Height, + LastError: info, + Blocks: cc.blocksResults, } }