diff --git a/Dockerfile b/Dockerfile index 380f7c7..af37ba8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # 1st stage, build app -FROM golang:1.19 as builder +FROM golang:1.20.6-bullseye as builder RUN apt-get update && apt-get -y upgrade && apt-get install -y upx COPY . /build/app WORKDIR /build/app diff --git a/docs/README.md b/docs/README.md index 812f174..f5b0872 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,7 +10,7 @@ This is a tool for validators running tendermint nodes. It sends notifications w - [Configuration File Settings](config.md) - [Setting up PagerDuty](pagerduty.md) - [Setting up Discord](discord.md) -- TODO: [Setting up Telegram](telegram.md) +- [Setting up Telegram](telegram.md) - [Prometheus Exports](prometheus.md) - [Remotely Configuring Tenderduty](remote.md) - [Running on Akash](akash.md) @@ -79,4 +79,4 @@ Notifications: * There really isn't anything special about the notifications it sends. For Discord and Telegram it will only send an alert on a new alarm and when the alarm clears. Pagerduty has a little more nuance. * Pagerduty: * Pro-tip: the alarms sent to pagerduty all use a unique "key". Pagerduty will automatically de-deduplicate alerts based on this key. If you want redundant monitoring you can run multiple instances of tenderduty alerting to pagerduty and will not get duplicate alerts. - * Additional flapping detection is applied to pagerduty (not to discord or telegram). If a node is going up and down every few minutes it will only send an alert once in a five minute period. \ No newline at end of file + * Additional flapping detection is applied to pagerduty (not to discord or telegram). If a node is going up and down every few minutes it will only send an alert once in a five minute period. diff --git a/docs/install.md b/docs/install.md index 52ae625..91dd531 100644 --- a/docs/install.md +++ b/docs/install.md @@ -36,6 +36,7 @@ services: volumes: - home:/var/lib/tenderduty - ./config.yml:/var/lib/tenderduty/config.yml + - ./chains.d:/var/lib/tenderduty/chains.d/ logging: driver: "json-file" options: @@ -159,4 +160,4 @@ sudo systemctl start tenderduty # and to watch the logs, press CTRL-C to stop watching sudo journalctl -fu tenderduty -``` \ No newline at end of file +``` diff --git a/example-config.yml b/example-config.yml index c261f6f..b0b9a1c 100644 --- a/example-config.yml +++ b/example-config.yml @@ -67,7 +67,7 @@ chains: chain_id: osmosis-1 # Hooray, in v2 we derive the valcons from abci queries so you don't have to jump through hoops to figure out how # to convert ed25519 keys to the appropriate bech32 address. - # Use valcons address if using ICS + # Use valcons address if using ICS or tendermint/PubKeyBn254 valoper_address: osmovaloper1xxxxxxx... # Should the monitor revert to using public API endpoints if all supplied RCP nodes fail? # This isn't always reliable, not all public nodes have websocket proxying setup correctly. diff --git a/example-docker-compose.yml b/example-docker-compose.yml index 4238699..2d3ffb6 100644 --- a/example-docker-compose.yml +++ b/example-docker-compose.yml @@ -11,6 +11,7 @@ volumes: services: tenderduty: + container_name: tenderduty build: . command: "" expose: @@ -30,7 +31,8 @@ services: - monitor-net caddy: - image: caddy:2.3.0 + container_name: caddy + image: caddy:2.7.6 ports: - "80:80" - "443:443" diff --git a/td2/dashboard/server.go b/td2/dashboard/server.go index 9f0eba5..80cc8b0 100644 --- a/td2/dashboard/server.go +++ b/td2/dashboard/server.go @@ -3,8 +3,6 @@ package dash import ( "embed" "encoding/json" - "github.com/gorilla/websocket" - "github.com/textileio/go-threads/broadcast" "io/fs" "log" "net/http" @@ -12,6 +10,9 @@ import ( "sort" "sync" "time" + + "github.com/gorilla/websocket" + "github.com/textileio/go-threads/broadcast" ) var ( diff --git a/td2/dashboard/types.go b/td2/dashboard/types.go index 7e98cb4..cb96b03 100644 --- a/td2/dashboard/types.go +++ b/td2/dashboard/types.go @@ -1,20 +1,21 @@ package dash type ChainStatus struct { - MsgType string `json:"msgType"` - Name string `json:"name"` - ChainId string `json:"chain_id"` - Moniker string `json:"moniker"` - Bonded bool `json:"bonded"` - Jailed bool `json:"jailed"` - Tombstoned bool `json:"tombstoned"` - Missed int64 `json:"missed"` - Window int64 `json:"window"` - Nodes int `json:"nodes"` - HealthyNodes int `json:"healthy_nodes"` - ActiveAlerts int `json:"active_alerts"` - Height int64 `json:"height"` - LastError string `json:"last_error"` + MsgType string `json:"msgType"` + Name string `json:"name"` + ChainId string `json:"chain_id"` + Moniker string `json:"moniker"` + Bonded bool `json:"bonded"` + Jailed bool `json:"jailed"` + Tombstoned bool `json:"tombstoned"` + Missed int64 `json:"missed"` + Window int64 `json:"window"` + MinSignedPerWindow float64 `json:"min_signed_per_window"` + Nodes int `json:"nodes"` + HealthyNodes int `json:"healthy_nodes"` + ActiveAlerts int `json:"active_alerts"` + Height int64 `json:"height"` + LastError string `json:"last_error"` Blocks []int `json:"blocks"` } diff --git a/td2/rpc.go b/td2/rpc.go index b3429fe..f25ac9a 100644 --- a/td2/rpc.go +++ b/td2/rpc.go @@ -2,6 +2,7 @@ package tenderduty import ( "context" + "encoding/json" "errors" "fmt" "io" @@ -23,6 +24,7 @@ func (cc *ChainConfig) newRpc() error { for _, endpoint := range cc.Nodes { anyWorking = anyWorking || !endpoint.down } + // grab the first working endpoint tryUrl := func(u string) (msg string, down, syncing bool) { _, err := url.Parse(u) @@ -39,20 +41,28 @@ func (cc *ChainConfig) newRpc() error { down = true return } + var network string + var catching_up bool status, err := cc.client.Status(ctx) if err != nil { - msg = fmt.Sprintf("❌ could not get status for %s: (%s) %s", cc.name, u, err) - down = true - l(msg) - return + n, c, err := getStatusWithEndpoint(ctx, u) + if err != nil { + msg = fmt.Sprintf("❌ could not get status for %s: (%s) %s", cc.name, u, err) + down = true + l(msg) + return + } + network, catching_up = n, c + } else { + network, catching_up = status.NodeInfo.Network, status.SyncInfo.CatchingUp } - if status.NodeInfo.Network != cc.ChainId { - msg = fmt.Sprintf("chain id %s on %s does not match, expected %s, skipping", status.NodeInfo.Network, u, cc.ChainId) + if network != cc.ChainId { + msg = fmt.Sprintf("chain id %s on %s does not match, expected %s, skipping", network, u, cc.ChainId) down = true l(msg) return } - if status.SyncInfo.CatchingUp { + if catching_up { msg = fmt.Sprint("🐢 node is not synced, skipping ", u) syncing = true down = true @@ -97,21 +107,22 @@ func (cc *ChainConfig) newRpc() error { cc.lastError = "no usable RPC endpoints available for " + cc.ChainId if td.EnableDash { td.updateChan <- &dash.ChainStatus{ - MsgType: "status", - Name: cc.name, - ChainId: cc.ChainId, - Moniker: cc.valInfo.Moniker, - Bonded: cc.valInfo.Bonded, - Jailed: cc.valInfo.Jailed, - Tombstoned: cc.valInfo.Tombstoned, - Missed: cc.valInfo.Missed, - Window: cc.valInfo.Window, - Nodes: len(cc.Nodes), - HealthyNodes: 0, - ActiveAlerts: 1, - Height: 0, - LastError: cc.lastError, - Blocks: cc.blocksResults, + MsgType: "status", + Name: cc.name, + ChainId: cc.ChainId, + Moniker: cc.valInfo.Moniker, + Bonded: cc.valInfo.Bonded, + Jailed: cc.valInfo.Jailed, + Tombstoned: cc.valInfo.Tombstoned, + Missed: cc.valInfo.Missed, + Window: cc.valInfo.Window, + MinSignedPerWindow: cc.minSignedPerWindow, + Nodes: len(cc.Nodes), + HealthyNodes: 0, + ActiveAlerts: 1, + Height: 0, + LastError: cc.lastError, + Blocks: cc.blocksResults, } } return errors.New("no usable endpoints available for " + cc.ChainId) @@ -261,3 +272,51 @@ func guessPublicEndpoint(u string) string { } return proto + matches[1] + port } + +func getStatusWithEndpoint(ctx context.Context, u string) (string, bool, error) { + // Parse the URL + parsedURL, err := url.Parse(u) + if err != nil { + return "", false, err + } + + // Check if the scheme is 'tcp' and modify to 'http' + if parsedURL.Scheme == "tcp" { + parsedURL.Scheme = "http" + } + + queryPath := fmt.Sprintf("%s/status", parsedURL.String()) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, queryPath, nil) + if err != nil { + return "", false, err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", false, err + } + defer resp.Body.Close() + + b, err := io.ReadAll(resp.Body) + if err != nil { + return "", false, err + } + + type tendermintStatus struct { + JsonRPC string `json:"jsonrpc"` + ID int `json:"id"` + Result struct { + NodeInfo struct { + Network string `json:"network"` + } `json:"node_info"` + SyncInfo struct { + CatchingUp bool `json:"catching_up"` + } `json:"sync_info"` + } `json:"result"` + } + var status tendermintStatus + if err := json.Unmarshal(b, &status); err != nil { + return "", false, err + } + return status.Result.NodeInfo.Network, status.Result.SyncInfo.CatchingUp, nil +} diff --git a/td2/run.go b/td2/run.go index 0cce9fa..c98c6f5 100644 --- a/td2/run.go +++ b/td2/run.go @@ -107,6 +107,12 @@ func Run(configFile, stateFile, chainConfigDirectory string, password *string) e time.Sleep(5 * time.Second) continue } + + e = cc.GetMinSignedPerWindow() + if e != nil { + l("🛑", cc.ChainId, e) + } + e = cc.GetValInfo(true) if e != nil { l("🛑", cc.ChainId, e) diff --git a/td2/static/index.html b/td2/static/index.html index 7b2425a..fdffb8c 100644 --- a/td2/static/index.html +++ b/td2/static/index.html @@ -10,6 +10,7 @@ +