From 5c251851beb8434f2478c0e2c0fc7f9ab7983a73 Mon Sep 17 00:00:00 2001 From: Nico Flaig Date: Sat, 10 Feb 2024 05:16:38 +0100 Subject: [PATCH] feat: add new panels to validator client dashboard (#6415) * Add url as label request to fallbacks and score metrics * Add url as label to request errors * Add connected beacon nodes panel * Add requests to fallback nodes panel * Add REST API request errors panel --- dashboards/lodestar_validator_client.json | 349 +++++++++++++++++- packages/api/src/utils/client/httpClient.ts | 6 +- packages/api/src/utils/client/metrics.ts | 6 +- .../src/metrics/metrics/lodestar.ts | 12 +- packages/validator/src/metrics.ts | 12 +- 5 files changed, 356 insertions(+), 29 deletions(-) diff --git a/dashboards/lodestar_validator_client.json b/dashboards/lodestar_validator_client.json index 7cc41cffdb34..8ec6a04437b1 100644 --- a/dashboards/lodestar_validator_client.json +++ b/dashboards/lodestar_validator_client.json @@ -506,21 +506,121 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "A list of beacon nodes the validator client is connected to, with their last known health status. 
For fallback nodes, this status might be inaccurate as it is only updated if the primary node is unhealthy.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "urlIndex" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "index": 0, + "text": "Primary" + } + }, + "type": "value" + }, + { + "options": { + "from": 1, + "result": { + "index": 1, + "text": "Fallback" + }, + "to": 999 + }, + "type": "range" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "red", + "index": 2, + "text": "Unhealthy" + }, + "10": { + "color": "green", + "index": 0, + "text": "Healthy" + } + }, + "type": "value" + }, + { + "options": { + "from": 1, + "result": { + "color": "yellow", + "index": 1, + "text": "Degraded" + }, + "to": 9 + }, + "type": "range" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + } + ] + } + ] + }, "gridPos": { "h": 4, "w": 12, "x": 12, "y": 5 }, - "id": 45, + "id": 44, "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false }, - "content": "_Validator metrics =D_", - "mode": "markdown" + "showHeader": false }, "pluginVersion": "10.1.1", "targets": [ @@ -529,10 +629,66 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "editorMode": "code", + "exemplar": false, + "expr": "vc_rest_api_client_urls_score", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, "refId": "A" } ], - "type": "text" + "title": "Connected beacon nodes", + "transformations": [ + { + 
"id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "client_name": true, + "group": true, + "host_type": true, + "instance": true, + "job": true, + "network": true, + "scrape_location": true + }, + "indexByName": { + "Time": 3, + "Value": 2, + "__name__": 4, + "baseUrl": 0, + "client_name": 5, + "group": 6, + "host_type": 7, + "instance": 8, + "job": 9, + "network": 10, + "scrape_location": 11, + "urlIndex": 1 + }, + "renameByName": { + "Value": "Status", + "baseUrl": "URL", + "urlIndex": "Type" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Type" + } + ] + } + } + ], + "type": "table" }, { "datasource": { @@ -1100,7 +1256,7 @@ "unit": "s" } }, - "pluginVersion": "9.3.2", + "pluginVersion": "10.1.1", "reverseYBuckets": false, "targets": [ { @@ -1212,7 +1368,7 @@ "unit": "s" } }, - "pluginVersion": "9.3.2", + "pluginVersion": "10.1.1", "reverseYBuckets": false, "targets": [ { @@ -1269,6 +1425,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1349,6 +1506,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1429,6 +1587,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineStyle": { "fill": "solid" @@ -1535,6 +1694,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1615,6 +1775,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1750,7 +1911,7 @@ "unit": "s" } }, - "pluginVersion": "9.3.2", + "pluginVersion": "10.1.1", "reverseYBuckets": false, "targets": [ { @@ -1806,6 +1967,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1935,6 +2097,7 @@ 
"tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -2013,6 +2176,170 @@ ], "title": "Duties reorgs / sec", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "60 * sum(rate(vc_rest_api_client_request_errors_total[$rate_interval])) by (baseUrl) > 0", + "instant": false, + "legendFormat": "{{baseUrl}}", + "range": true, + "refId": "A" + } + ], + "title": "REST API request errors (rate / min)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + 
"gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "60 * sum(rate(vc_rest_api_client_request_to_fallbacks_total[$rate_interval])) by (baseUrl) > 0", + "instant": false, + "legendFormat": "{{baseUrl}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests to fallback nodes (rate / min)", + "type": "timeseries" } ], "refresh": "10s", diff --git a/packages/api/src/utils/client/httpClient.ts b/packages/api/src/utils/client/httpClient.ts index 674c6ddadbe8..c0fd2b8ad1aa 100644 --- a/packages/api/src/utils/client/httpClient.ts +++ b/packages/api/src/utils/client/httpClient.ts @@ -160,7 +160,7 @@ export class HttpClient implements IHttpClient { if (metrics) { metrics.urlsScore.addCollect(() => { for (let i = 0; i < this.urlsScore.length; i++) { - metrics.urlsScore.set({urlIndex: i}, this.urlsScore[i]); + metrics.urlsScore.set({urlIndex: i, baseUrl: this.urlsOpts[i].baseUrl}, this.urlsScore[i]); } }); } @@ -211,7 +211,7 @@ export class HttpClient implements IHttpClient { const routeId = opts.routeId ?? 
DEFAULT_ROUTE_ID; if (i > 0) { - this.metrics?.requestToFallbacks.inc({routeId}); + this.metrics?.requestToFallbacks.inc({routeId, baseUrl}); this.logger?.debug("Requesting fallback URL", {routeId, baseUrl, score: this.urlsScore[i]}); } @@ -319,7 +319,7 @@ export class HttpClient implements IHttpClient { this.logger?.debug("HttpClient response", {routeId}); return {status: res.status, body}; } catch (e) { - this.metrics?.requestErrors.inc({routeId}); + this.metrics?.requestErrors.inc({routeId, baseUrl}); if (isAbortedError(e as Error)) { if (signalGlobal?.aborted) { diff --git a/packages/api/src/utils/client/metrics.ts b/packages/api/src/utils/client/metrics.ts index 65089e92e7ec..b326bdf3abb3 100644 --- a/packages/api/src/utils/client/metrics.ts +++ b/packages/api/src/utils/client/metrics.ts @@ -3,7 +3,7 @@ import {Gauge, GaugeExtra, Histogram} from "@lodestar/utils"; export type Metrics = { requestTime: Histogram<{routeId: string}>; streamTime: Histogram<{routeId: string}>; - requestErrors: Gauge<{routeId: string}>; - requestToFallbacks: Gauge<{routeId: string}>; - urlsScore: GaugeExtra<{urlIndex: number}>; + requestErrors: Gauge<{routeId: string; baseUrl: string}>; + requestToFallbacks: Gauge<{routeId: string; baseUrl: string}>; + urlsScore: GaugeExtra<{urlIndex: number; baseUrl: string}>; }; diff --git a/packages/beacon-node/src/metrics/metrics/lodestar.ts b/packages/beacon-node/src/metrics/metrics/lodestar.ts index 284f4e75c064..0f4940cd5d5c 100644 --- a/packages/beacon-node/src/metrics/metrics/lodestar.ts +++ b/packages/beacon-node/src/metrics/metrics/lodestar.ts @@ -1622,21 +1622,21 @@ export function createLodestarMetrics( // Provide max resolution on problematic values around 1 second buckets: [0.1, 0.5, 1, 2, 5, 15], }), - requestErrors: register.gauge<{routeId: string}>({ + requestErrors: register.gauge<{routeId: string; baseUrl: string}>({ name: "lodestar_builder_http_client_request_errors_total", help: "Total count of errors on builder http client 
requests by routeId", - labelNames: ["routeId"], + labelNames: ["routeId", "baseUrl"], }), - requestToFallbacks: register.gauge<{routeId: string}>({ + requestToFallbacks: register.gauge<{routeId: string; baseUrl: string}>({ name: "lodestar_builder_http_client_request_to_fallbacks_total", help: "Total count of requests to fallback URLs on builder http API by routeId", - labelNames: ["routeId"], + labelNames: ["routeId", "baseUrl"], }), - urlsScore: register.gauge<{urlIndex: number}>({ + urlsScore: register.gauge<{urlIndex: number; baseUrl: string}>({ name: "lodestar_builder_http_client_urls_score", help: "Current score of builder http URLs by url index", - labelNames: ["urlIndex"], + labelNames: ["urlIndex", "baseUrl"], }), }, diff --git a/packages/validator/src/metrics.ts b/packages/validator/src/metrics.ts index 4acf66955769..56d94318d12b 100644 --- a/packages/validator/src/metrics.ts +++ b/packages/validator/src/metrics.ts @@ -299,22 +299,22 @@ export function getMetrics(register: MetricsRegisterExtra, gitData: LodestarGitD buckets: [0.01, 0.1, 1, 2, 5], }), - requestErrors: register.gauge<{routeId: string}>({ + requestErrors: register.gauge<{routeId: string; baseUrl: string}>({ name: "vc_rest_api_client_request_errors_total", help: "Total count of errors on REST API client requests by routeId", - labelNames: ["routeId"], + labelNames: ["routeId", "baseUrl"], }), - requestToFallbacks: register.gauge<{routeId: string}>({ + requestToFallbacks: register.gauge<{routeId: string; baseUrl: string}>({ name: "vc_rest_api_client_request_to_fallbacks_total", help: "Total count of requests to fallback URLs on REST API by routeId", - labelNames: ["routeId"], + labelNames: ["routeId", "baseUrl"], }), - urlsScore: register.gauge<{urlIndex: number}>({ + urlsScore: register.gauge<{urlIndex: number; baseUrl: string}>({ name: "vc_rest_api_client_urls_score", help: "Current score of REST API URLs by url index", - labelNames: ["urlIndex"], + labelNames: ["urlIndex", "baseUrl"], }), },