From 267ffeff47b72c9ca5fc8c101b7dc9da086c435e Mon Sep 17 00:00:00 2001 From: Zander Franks Date: Mon, 1 Apr 2024 23:40:06 -0500 Subject: [PATCH 1/9] feat(metrics): serve httpsd config route from CP, other changes Signed-off-by: Zander Franks --- Cargo.lock | 13 +++++ Cargo.toml | 2 +- crates/snot-agent/src/rpc.rs | 29 +++++------ crates/snot-common/src/rpc/agent.rs | 12 +++-- crates/snot-common/src/rpc/control.rs | 2 - crates/snot/src/cli.rs | 10 ++-- crates/snot/src/env/mod.rs | 2 + crates/snot/src/server/mod.rs | 27 ++++++++-- crates/snot/src/server/prometheus.rs | 75 +++++++++++++++++++++++++++ crates/snot/src/server/rpc.rs | 4 -- crates/snot/src/state.rs | 56 +++++++++++++++----- 11 files changed, 187 insertions(+), 45 deletions(-) create mode 100644 crates/snot/src/server/prometheus.rs diff --git a/Cargo.lock b/Cargo.lock index 5299b91b..6b0a344d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -367,6 +367,7 @@ checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" dependencies = [ "async-trait", "axum-core", + "axum-macros", "base64 0.21.7", "bytes", "futures-util", @@ -440,6 +441,18 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00c055ee2d014ae5981ce1016374e8213682aa14d9bf40e48ab48b5f3ef20eaa" +dependencies = [ + "heck", + "proc-macro2", + "quote 1.0.35", + "syn 2.0.52", +] + [[package]] name = "backtrace" version = "0.3.69" diff --git a/Cargo.toml b/Cargo.toml index 329fabd6..1ffb8e99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ resolver = "2" [workspace.dependencies] aleo-std = "=0.1.24" -axum = "0.7.5" +axum = { version = "0.7.5", features = ["macros"] } anyhow = "1" bimap = "0.6" bincode = "1.3" diff --git a/crates/snot-agent/src/rpc.rs b/crates/snot-agent/src/rpc.rs index e0f342eb..c9f6dda9 100644 --- a/crates/snot-agent/src/rpc.rs +++ b/crates/snot-agent/src/rpc.rs @@ -4,7 +4,7 @@ use snot_common::{ rpc::{ agent::{ AgentError, AgentMetric, AgentService, AgentServiceRequest, AgentServiceResponse, - ReconcileError, + Handshake, ReconcileError, }, control::{ControlServiceRequest, ControlServiceResponse}, MuxMessage, @@ -52,20 +52,19 @@ pub struct AgentRpcServer { } impl AgentService for AgentRpcServer { - async fn keep_jwt(self, _: context::Context, token: String) { - debug!("control plane delegated new JWT"); - - // cache the JWT in the state JWT mutex - self.state - .jwt - .lock() - .expect("failed to acquire JWT lock") - .replace(token.to_owned()); - - // TODO: write the JWT to a file somewhere else - tokio::fs::write(self.state.cli.path.join(JWT_FILE), token) - .await - .expect("failed to write jwt file"); + async fn handshake(self, _: context::Context, handshake: Handshake) { + if let Some(token) = handshake.jwt { + // cache the JWT in the state JWT mutex + self.state + .jwt + .lock() + .expect("failed to acquire JWT lock") + .replace(token.to_owned()); + + tokio::fs::write(self.state.cli.path.join(JWT_FILE), token) + .await + .expect("failed to write jwt file"); + } } async fn reconcile( diff --git a/crates/snot-common/src/rpc/agent.rs b/crates/snot-common/src/rpc/agent.rs index 9c25f782..d9f816a2 100644 --- a/crates/snot-common/src/rpc/agent.rs +++ b/crates/snot-common/src/rpc/agent.rs @@ -6,11 +6,16 @@ use thiserror::Error; use super::control::ResolveError; use crate::state::{AgentState, PortConfig}; +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct Handshake { + pub jwt: Option, +} + /// The RPC service that agents implement as a server. #[tarpc::service] pub trait AgentService { - /// Control plane instructs the agent to use a JWT when reconnecting later. - async fn keep_jwt(jwt: String); + /// Handshake with some initial connection details. + async fn handshake(handshake: Handshake); /// Control plane asks the agent for its external network address, along /// with local addrs. @@ -27,7 +32,8 @@ pub trait AgentService { async fn broadcast_tx(tx: String) -> Result<(), AgentError>; /// Locally execute an authorization, using the given query - /// environment id is passed so the agent can determine which aot binary to use + /// environment id is passed so the agent can determine which aot binary to + /// use async fn execute_authorization( env_id: usize, query: String, diff --git a/crates/snot-common/src/rpc/control.rs b/crates/snot-common/src/rpc/control.rs index f5a3f039..e653dfd5 100644 --- a/crates/snot-common/src/rpc/control.rs +++ b/crates/snot-common/src/rpc/control.rs @@ -10,8 +10,6 @@ use crate::state::AgentId; #[tarpc::service] pub trait ControlService { - async fn placeholder() -> String; - /// Resolve the addresses of the given agents. async fn resolve_addrs( peers: HashSet, diff --git a/crates/snot/src/cli.rs b/crates/snot/src/cli.rs index 2c1dfd1a..085c2772 100644 --- a/crates/snot/src/cli.rs +++ b/crates/snot/src/cli.rs @@ -1,15 +1,19 @@ -use std::path::PathBuf; +use std::{net::SocketAddr, path::PathBuf}; use clap::Parser; #[derive(Debug, Parser)] pub struct Cli { - #[arg(long, default_value = "1234")] /// Control plane server port + #[arg(long, default_value = "1234")] pub port: u16, - #[arg(long, default_value = "snot-control-data")] + /// Optional IP:port that a Prometheus server is running on + #[arg(long)] + pub prometheus: Option, + /// Path to the directory containing the stored data + #[arg(long, default_value = "snot-control-data")] pub path: PathBuf, #[arg(long)] diff --git a/crates/snot/src/env/mod.rs b/crates/snot/src/env/mod.rs index b26f97b4..4fbbf129 100644 --- a/crates/snot/src/env/mod.rs +++ b/crates/snot/src/env/mod.rs @@ -121,6 +121,7 @@ impl Environment { static ENVS_COUNTER: AtomicUsize = AtomicUsize::new(0); let mut state_lock = state.envs.write().await; + state.prom_httpsd.lock().unwrap().set_dirty(); let mut storage = None; let mut node_map = BiHashMap::default(); @@ -296,6 +297,7 @@ impl Environment { info!("clearing env {id} state..."); let Some(env) = ({ let mut state_lock = state.envs.write().await; + state.prom_httpsd.lock().unwrap().set_dirty(); state_lock.remove(id) }) else { bail!("env {id} not found") diff --git a/crates/snot/src/server/mod.rs b/crates/snot/src/server/mod.rs index b0928c8d..8271bb15 100644 --- a/crates/snot/src/server/mod.rs +++ b/crates/snot/src/server/mod.rs @@ -17,7 +17,10 @@ use futures_util::stream::StreamExt; use serde::Deserialize; use snot_common::{ prelude::*, - rpc::{agent::AgentServiceClient, control::ControlService}, + rpc::{ + agent::{AgentServiceClient, Handshake}, + control::ControlService, + }, }; use surrealdb::Surreal; use tarpc::server::Channel; @@ -38,6 +41,7 @@ use crate::{ mod api; mod content; pub mod jwt; +pub mod prometheus; mod rpc; pub async fn start(cli: Cli) -> Result<()> { @@ -51,11 +55,13 @@ pub async fn start(cli: Cli) -> Result<()> { storage_ids: Default::default(), storage: Default::default(), envs: Default::default(), + prom_httpsd: Default::default(), }; let app = Router::new() .route("/agent", get(agent_ws_handler)) .nest("/api/v1", api::routes()) + .nest("/prometheus", prometheus::routes()) // /env//ledger/* - ledger query service reverse proxying /mainnet/latest/stateRoot .nest("/content", content::init_routes(&state).await) .with_state(Arc::new(state)) @@ -146,6 +152,7 @@ async fn handle_socket( let id: AgentId = 'insertion: { let client = client.clone(); let mut pool = state.pool.write().await; + let mut handshake = Handshake::default(); // attempt to reconnect if claims were passed 'reconnect: { @@ -167,6 +174,15 @@ async fn handle_socket( // TODO: probably want to reconcile with old state? + // handshake with client + let client = agent.rpc().cloned().unwrap(); + tokio::spawn(async move { + // we do this in a separate task because we don't want to hold up pool insertion + if let Err(e) = client.handshake(tarpc::context::current(), handshake).await { + warn!("failed to perform client handshake: {e}"); + } + }); + break 'insertion id; } } @@ -184,12 +200,15 @@ async fn handle_socket( // create a new agent let agent = Agent::new(client.to_owned(), id, query.flags); - // sign the jwt and send it to the agent + // sign the jwt let signed_jwt = agent.sign_jwt(); + handshake.jwt = Some(signed_jwt); + + // handshake with the client tokio::spawn(async move { // we do this in a separate task because we don't want to hold up pool insertion - if let Err(e) = client.keep_jwt(tarpc::context::current(), signed_jwt).await { - warn!("failed to inform client of JWT: {e}"); + if let Err(e) = client.handshake(tarpc::context::current(), handshake).await { + warn!("failed to perform client handshake: {e}"); } }); diff --git a/crates/snot/src/server/prometheus.rs b/crates/snot/src/server/prometheus.rs new file mode 100644 index 00000000..235c2026 --- /dev/null +++ b/crates/snot/src/server/prometheus.rs @@ -0,0 +1,75 @@ +use std::{collections::HashMap, net::IpAddr}; + +use axum::{extract::State, response::IntoResponse, routing::get, Json, Router}; +use serde::Serialize; +use snot_common::state::AgentState; + +use super::AppState; +use crate::state::AgentAddrs; +pub(super) fn routes() -> Router { + Router::new().route("/httpsd", get(get_httpsd)) +} + +#[derive(Debug, Clone, Serialize)] +pub struct StaticConfig { + pub targets: [IpAddr; 1], + pub labels: HashMap, +} + +/// Caching container for the Prometheus HTTP service discovery response. Marked +/// 'dirty' when environment agents are reallocated. +#[derive(Debug, Clone, Default)] +pub enum HttpsdResponse { + #[default] + Dirty, + Clean(Vec), +} + +impl HttpsdResponse { + pub fn set_dirty(&mut self) { + *self = Self::Dirty; + } +} + +#[axum::debug_handler] +async fn get_httpsd(State(state): State) -> impl IntoResponse { + let pool = state.pool.read().await; + + let mut prom_httpsd = state.prom_httpsd.lock().unwrap(); + let static_configs = match &*prom_httpsd { + // use the cached response + HttpsdResponse::Clean(static_configs) => static_configs.to_owned(), + + // recompute the response and save it + HttpsdResponse::Dirty => { + let mut static_configs = vec![]; + + for (agent_id, agent) in pool.iter() { + let Some(agent_addr) = agent.addrs().and_then(AgentAddrs::usable) else { + continue; + }; + + match agent.state() { + AgentState::Node(env_id, _) => { + static_configs.push(StaticConfig { + targets: [agent_addr], + labels: [ + ("env_id".into(), env_id.to_string()), + ("agent_id".into(), agent_id.to_string()), + ] + .into_iter() + .collect(), + }); + } + _ => (), + } + } + + *prom_httpsd = HttpsdResponse::Clean(static_configs.to_owned()); + + static_configs + } + }; + + Json(static_configs) +} diff --git a/crates/snot/src/server/rpc.rs b/crates/snot/src/server/rpc.rs index 5e8cdc73..ea13fd22 100644 --- a/crates/snot/src/server/rpc.rs +++ b/crates/snot/src/server/rpc.rs @@ -31,10 +31,6 @@ pub struct ControlRpcServer { } impl ControlService for ControlRpcServer { - async fn placeholder(self, _: context::Context) -> String { - "Hello, world".into() - } - async fn resolve_addrs( self, _: context::Context, diff --git a/crates/snot/src/state.rs b/crates/snot/src/state.rs index 3f43cea7..05d1207b 100644 --- a/crates/snot/src/state.rs +++ b/crates/snot/src/state.rs @@ -1,7 +1,7 @@ use std::{ collections::{HashMap, HashSet}, net::IpAddr, - sync::{Arc, Weak}, + sync::{Arc, Mutex, Weak}, time::Instant, }; @@ -25,7 +25,10 @@ use crate::{ cli::Cli, env::Environment, schema::storage::LoadedStorage, - server::jwt::{Claims, JWT_NONCE, JWT_SECRET}, + server::{ + jwt::{Claims, JWT_NONCE, JWT_SECRET}, + prometheus::HttpsdResponse, + }, }; pub type AppState = Arc; @@ -41,10 +44,28 @@ pub struct GlobalState { pub storage: RwLock>>, pub envs: RwLock>>, + pub prom_httpsd: Mutex, } /// This is the representation of a public addr or a list of internal addrs. -pub type AgentAddrs = (Option, Vec); +#[derive(Debug, Clone)] +pub struct AgentAddrs { + pub external: Option, + pub internal: Vec, +} + +impl AgentAddrs { + pub fn usable(&self) -> Option { + self.external + .as_ref() + .or_else(|| self.internal.first()) + .copied() + } + + pub fn is_some(&self) -> bool { + self.external.is_some() || !self.internal.is_empty() + } +} /// An active agent, known by the control plane. #[derive(Debug)] @@ -97,11 +118,14 @@ impl Agent { /// Whether this agent is capable of being a node in the network. pub fn is_node_capable(&self) -> bool { - if !self.is_connected() || self.addrs.is_none() { + if !self.is_connected() { return false; }; - let (external, internal) = self.addrs.as_ref().unwrap(); - external.is_some() || !internal.is_empty() + + self.addrs + .as_ref() + .map(AgentAddrs::is_some) + .unwrap_or_default() } /// Check if an agent has a set of labels @@ -164,8 +188,8 @@ impl Agent { Arc::strong_count(&self.env_claim) > 1 } - /// Get a weak reference to the env claim, which can be used to later lock this - /// agent for an environment. + /// Get a weak reference to the env claim, which can be used to later lock + /// this agent for an environment. pub fn get_env_claim(&self) -> Weak { Arc::downgrade(&self.env_claim) } @@ -253,10 +277,14 @@ impl Agent { self.flags.local_pk } + pub fn addrs(&self) -> Option<&AgentAddrs> { + self.addrs.as_ref() + } + /// Set the external and internal addresses of the agent. This does **not** /// trigger a reconcile - pub fn set_addrs(&mut self, external_addr: Option, internal_addrs: Vec) { - self.addrs = Some((external_addr, internal_addrs)); + pub fn set_addrs(&mut self, external: Option, internal: Vec) { + self.addrs = Some(AgentAddrs { external, internal }); } pub fn map_to_node_state_reconcile(&self, f: F) -> Option<(AgentId, AgentClient, AgentState)> @@ -328,7 +356,9 @@ pub fn resolve_addrs( .get(&src) .ok_or_else(|| anyhow!("source agent not found"))?; - let all_internal = addr_map.values().all(|(ext, _)| ext.is_none()); + let all_internal = addr_map + .values() + .all(|AgentAddrs { external, .. }| external.is_none()); Ok(peers .iter() @@ -344,10 +374,10 @@ pub fn resolve_addrs( // if there are no external addresses in the entire addr map, // use the first internal address if all_internal { - return addrs.1.first().copied().map(|addr| (*id, addr)); + return addrs.internal.first().copied().map(|addr| (*id, addr)); } - match (src_addrs.0, addrs.0, addrs.1.first()) { + match (src_addrs.external, addrs.external, addrs.internal.first()) { // if peers have the same external address, use the first internal address (Some(src_ext), Some(peer_ext), Some(peer_int)) if src_ext == peer_ext => { Some((*id, *peer_int)) From 7d1532baa2d7cd2467a6c2e68e000a0d00e7a120 Mon Sep 17 00:00:00 2001 From: Zander Franks Date: Tue, 2 Apr 2024 18:05:52 -0500 Subject: [PATCH 2/9] feat(metrics): prometheus/grafana boilerplate, working httpsd scraping Signed-off-by: Zander Franks --- .gitignore | 1 + crates/snot/src/env/mod.rs | 4 +- crates/snot/src/server/prometheus.rs | 33 ++++-- crates/snot/src/state.rs | 4 +- scripts/metrics.sh | 3 + scripts/metrics/dashboard.json | 156 +++++++++++++++++++++++++++ scripts/metrics/docker-compose.yaml | 29 +++++ scripts/metrics/grafana.ini | 0 scripts/metrics/prometheus.yml | 18 ++++ 9 files changed, 237 insertions(+), 11 deletions(-) create mode 100755 scripts/metrics.sh create mode 100644 scripts/metrics/dashboard.json create mode 100644 scripts/metrics/docker-compose.yaml create mode 100644 scripts/metrics/grafana.ini create mode 100644 scripts/metrics/prometheus.yml diff --git a/.gitignore b/.gitignore index 3976f9e2..3c7636f3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /tests /snot-control-data /snot-data +/metrics-data diff --git a/crates/snot/src/env/mod.rs b/crates/snot/src/env/mod.rs index 4fbbf129..2923a0fa 100644 --- a/crates/snot/src/env/mod.rs +++ b/crates/snot/src/env/mod.rs @@ -121,7 +121,7 @@ impl Environment { static ENVS_COUNTER: AtomicUsize = AtomicUsize::new(0); let mut state_lock = state.envs.write().await; - state.prom_httpsd.lock().unwrap().set_dirty(); + state.prom_httpsd.lock().await.set_dirty(); let mut storage = None; let mut node_map = BiHashMap::default(); @@ -297,7 +297,7 @@ impl Environment { info!("clearing env {id} state..."); let Some(env) = ({ let mut state_lock = state.envs.write().await; - state.prom_httpsd.lock().unwrap().set_dirty(); + state.prom_httpsd.lock().await.set_dirty(); state_lock.remove(id) }) else { bail!("env {id} not found") diff --git a/crates/snot/src/server/prometheus.rs b/crates/snot/src/server/prometheus.rs index 235c2026..af7b0925 100644 --- a/crates/snot/src/server/prometheus.rs +++ b/crates/snot/src/server/prometheus.rs @@ -1,18 +1,19 @@ -use std::{collections::HashMap, net::IpAddr}; +use std::collections::HashMap; use axum::{extract::State, response::IntoResponse, routing::get, Json, Router}; use serde::Serialize; use snot_common::state::AgentState; +use tracing::{debug, info}; use super::AppState; -use crate::state::AgentAddrs; +use crate::{env::EnvPeer, state::AgentAddrs}; pub(super) fn routes() -> Router { Router::new().route("/httpsd", get(get_httpsd)) } #[derive(Debug, Clone, Serialize)] pub struct StaticConfig { - pub targets: [IpAddr; 1], + pub targets: [String; 1], pub labels: HashMap, } @@ -33,15 +34,18 @@ impl HttpsdResponse { #[axum::debug_handler] async fn get_httpsd(State(state): State) -> impl IntoResponse { - let pool = state.pool.read().await; + let mut prom_httpsd = state.prom_httpsd.lock().await; - let mut prom_httpsd = state.prom_httpsd.lock().unwrap(); let static_configs = match &*prom_httpsd { // use the cached response HttpsdResponse::Clean(static_configs) => static_configs.to_owned(), // recompute the response and save it HttpsdResponse::Dirty => { + debug!("httpsd response is dirty, regenerating..."); + let pool = state.pool.read().await; + let envs = state.envs.read().await; + let mut static_configs = vec![]; for (agent_id, agent) in pool.iter() { @@ -51,11 +55,26 @@ async fn get_httpsd(State(state): State) -> impl IntoResponse { match agent.state() { AgentState::Node(env_id, _) => { + // get the environment this agent belongs to + let Some(env) = envs.get(env_id) else { + continue; + }; + + // get the node key that corresponds to this agent + let Some(node_key) = + env.node_map.get_by_right(&EnvPeer::Internal(*agent_id)) + else { + continue; + }; + + info!("agent {} addrs: {:#?}", agent_id, agent.addrs()); + static_configs.push(StaticConfig { - targets: [agent_addr], + // targets: [format!("{agent_addr}:9000")], // TODO: metrics port + targets: ["host.docker.internal:9000".into()], // TODO: don't hard-code this :( labels: [ ("env_id".into(), env_id.to_string()), - ("agent_id".into(), agent_id.to_string()), + ("agent_id".into(), node_key.to_string()), ] .into_iter() .collect(), diff --git a/crates/snot/src/state.rs b/crates/snot/src/state.rs index 05d1207b..de35544b 100644 --- a/crates/snot/src/state.rs +++ b/crates/snot/src/state.rs @@ -1,7 +1,7 @@ use std::{ collections::{HashMap, HashSet}, net::IpAddr, - sync::{Arc, Mutex, Weak}, + sync::{Arc, Weak}, time::Instant, }; @@ -19,7 +19,7 @@ use snot_common::{ }; use surrealdb::{engine::local::Db, Surreal}; use tarpc::{client::RpcError, context}; -use tokio::sync::RwLock; +use tokio::sync::{Mutex, RwLock}; use crate::{ cli::Cli, diff --git a/scripts/metrics.sh b/scripts/metrics.sh new file mode 100755 index 00000000..3a7abf7c --- /dev/null +++ b/scripts/metrics.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +docker compose -f ./scripts/metrics/docker-compose.yaml up -d --force-recreate diff --git a/scripts/metrics/dashboard.json b/scripts/metrics/dashboard.json new file mode 100644 index 00000000..d5a6fdf4 --- /dev/null +++ b/scripts/metrics/dashboard.json @@ -0,0 +1,156 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdhk8woq2ik1sd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "cdhk8woq2ik1sd" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "avg(rate(snarkos_blocks_transactions_total{env_id=\"$env_id\", agent_id=\"$agent_id\"}[5m]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "TPS", + "type": "gauge" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "0", + "value": "0" + }, + "definition": "label_values(env_id)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "env_id", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(env_id)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "validator/0", + "value": "validator/0" + }, + "definition": "label_values(agent_id)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "agent_id", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(agent_id)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "snops dashboard", + "uid": "ddhka9opz083ke", + "version": 3, + "weekStart": "" +} diff --git a/scripts/metrics/docker-compose.yaml b/scripts/metrics/docker-compose.yaml new file mode 100644 index 00000000..51760419 --- /dev/null +++ b/scripts/metrics/docker-compose.yaml @@ -0,0 +1,29 @@ +name: snops-metrics + +services: + prometheus: + image: prom/prometheus + container_name: snops-prometheus + user: root + expose: [9090] + ports: [9090:9090] + restart: unless-stopped + # network_mode: host + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - ../../metrics-data/prometheus:/prometheus + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana-enterprise + container_name: snops-grafana + user: root + # depends_on: [prometheus] + ports: [3000:3000] + restart: unless-stopped + volumes: + - ./grafana.ini:/etc/grafana/grafana.ini + - ../../metrics-data/grafana:/var/lib/grafana + extra_hosts: + - "prometheus:host-gateway" diff --git a/scripts/metrics/grafana.ini b/scripts/metrics/grafana.ini new file mode 100644 index 00000000..e69de29b diff --git a/scripts/metrics/prometheus.yml b/scripts/metrics/prometheus.yml new file mode 100644 index 00000000..0e003b8b --- /dev/null +++ b/scripts/metrics/prometheus.yml @@ -0,0 +1,18 @@ +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 1m +scrape_configs: + - job_name: prometheus + honor_timestamps: true + metrics_path: /metrics + scheme: http + follow_redirects: true + static_configs: + - targets: [localhost:9090] + - job_name: snops + honor_timestamps: true + honor_labels: true + http_sd_configs: + - url: http://host.docker.internal:1234/prometheus/httpsd + refresh_interval: 15s From b7ed54ade6e5ef5cab09db8390daf83896d53445 Mon Sep 17 00:00:00 2001 From: Zander Franks Date: Fri, 5 Apr 2024 02:54:16 -0500 Subject: [PATCH 3/9] feat(metrics): prometheus location, promql queries, WIP outcome validation Signed-off-by: Zander Franks --- Cargo.lock | 286 +++++++++++++++++++++++--- crates/snops/Cargo.toml | 2 + crates/snops/src/cli.rs | 40 +++- crates/snops/src/env/mod.rs | 13 ++ crates/snops/src/env/timeline.rs | 45 +++- crates/snops/src/schema/outcomes.rs | 165 ++++++++++++++- crates/snops/src/server/mod.rs | 19 +- crates/snops/src/server/prometheus.rs | 39 +++- crates/snops/src/state.rs | 38 ++++ 9 files changed, 593 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0c507118..634f6da2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,7 +43,7 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "getrandom", "once_cell", "version_check", @@ -461,7 +461,7 @@ checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", "miniz_oxide", "object", @@ -715,6 +715,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "cactus" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbc26382d871df4b7442e3df10a9402bf3cf5e55cbd66f12be38861425f0564" + [[package]] name = "cc" version = "1.0.90" @@ -791,6 +797,12 @@ dependencies = [ "nom", ] +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -803,6 +815,20 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" +[[package]] +name = "cfgrammar" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf74ea341ae8905eac9a234b6a5a845e118c25bbbdecf85ec77431a8b3bfa0be" +dependencies = [ + "indexmap 1.9.3", + "lazy_static", + "num-traits", + "regex", + "serde", + "vob", +] + [[package]] name = "chrono" version = "0.4.35" @@ -969,7 +995,7 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -1124,7 +1150,7 @@ version = "5.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "hashbrown 0.14.3", "lock_api", "once_cell", @@ -1191,7 +1217,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "dirs-sys-next", ] @@ -1286,7 +1312,7 @@ version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -1307,6 +1333,38 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "enum-as-inner" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a" +dependencies = [ + "heck", + "proc-macro2", + "quote 1.0.35", + "syn 2.0.52", +] + +[[package]] +name = "enum-iterator" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fd242f399be1da0a5354aa462d57b4ab2b4ee0683cc552f7c007d2d12d36e94" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03cdc46ec28bd728e67540c528013c6a10eb69a02eb31078a1bda695438cbfb8" +dependencies = [ + "proc-macro2", + "quote 1.0.35", + "syn 2.0.52", +] + [[package]] name = "enum_index" version = "0.2.0" @@ -1397,6 +1455,18 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall", + "windows-sys 0.52.0", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -1696,19 +1766,40 @@ dependencies = [ "libm", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "js-sys", "libc", "wasi", "wasm-bindgen", ] +[[package]] +name = "getset" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote 1.0.35", + "syn 1.0.109", +] + [[package]] name = "gimli" version = "0.28.1" @@ -1727,7 +1818,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68a7f542ee6b35af73b06abc0dad1c1bae89964e4e253bc4b587b91c9637867b" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "dashmap", "futures", "futures-timer", @@ -2180,7 +2271,7 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", ] [[package]] @@ -2359,7 +2450,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "windows-targets 0.52.4", ] @@ -2448,6 +2539,59 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "lrlex" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22b832738fbfa58ad036580929e973b3b6bd31c6d6c7f18f6b5ea7b626675c85" +dependencies = [ + "getopts", + "lazy_static", + "lrpar", + "num-traits", + "regex", + "serde", + "try_from", + "vergen", +] + +[[package]] +name = "lrpar" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f270b952b07995fe874b10a5ed7dd28c80aa2130e37a7de7ed667d034e0a521" +dependencies = [ + "bincode", + "cactus", + "cfgrammar", + "filetime", + "indexmap 1.9.3", + "lazy_static", + "lrtable", + "num-traits", + "packedvec", + "regex", + "serde", + "static_assertions", + "vergen", + "vob", +] + +[[package]] +name = "lrtable" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a854115c6a10772ac154261592b082436abc869c812575cadcf9d7ceda8eff0b" +dependencies = [ + "cfgrammar", + "fnv", + "num-traits", + "serde", + "sparsevec", + "static_assertions", + "vob", +] + [[package]] name = "lru" version = "0.12.3" @@ -2509,7 +2653,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "digest", ] @@ -2712,7 +2856,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ "bitflags 2.4.2", - "cfg-if", + "cfg-if 1.0.0", "cfg_aliases", "libc", ] @@ -2902,7 +3046,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ "bitflags 2.4.2", - "cfg-if", + "cfg-if 1.0.0", "foreign-types", "libc", "once_cell", @@ -2988,6 +3132,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "packedvec" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bde3c690ec20e4a2b4fb46f0289a451181eb50011a1e2acc8d85e2fde9062a45" +dependencies = [ + "num-traits", + "serde", +] + [[package]] name = "parking" version = "2.2.0" @@ -3010,7 +3164,7 @@ version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "redox_syscall", "smallvec", @@ -3263,6 +3417,33 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prometheus-http-query" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0de773a6ba25c9164ed9d86d653a92fac759a6f0e683fd141d56bb96e80fd8b" +dependencies = [ + "enum-as-inner 0.6.0", + "mime", + "reqwest 0.11.26", + "serde", + "time", + "url", +] + +[[package]] +name = "promql-parser" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a24c16fbf55ea420c6286ef5ee86772062332d9f3b10d24a6edbc2e88840e1ad" +dependencies = [ + "cfgrammar", + "lazy_static", + "lrlex", + "lrpar", + "regex", +] + [[package]] name = "psl-types" version = "2.0.11" @@ -3681,7 +3862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", - "cfg-if", + "cfg-if 1.0.0", "getrandom", "libc", "spin 0.9.8", @@ -4130,7 +4311,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "cpufeatures", "digest", ] @@ -4141,7 +4322,7 @@ version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "cpufeatures", "digest", ] @@ -4655,7 +4836,7 @@ dependencies = [ "aleo-std", "anyhow", "blake2", - "cfg-if", + "cfg-if 1.0.0", "fxhash", "hashbrown 0.14.3", "hex", @@ -5273,7 +5454,7 @@ dependencies = [ "aleo-std", "anyhow", "bincode", - "cfg-if", + "cfg-if 1.0.0", "colored", "curl", "hex", @@ -5410,6 +5591,8 @@ dependencies = [ "indexmap 2.2.5", "jwt", "lazy_static", + "prometheus-http-query", + "promql-parser", "rand", "rand_chacha", "rayon", @@ -5513,6 +5696,18 @@ dependencies = [ "smallvec", ] +[[package]] +name = "sparsevec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928d1ef5df00aec8c5643c2ac37db4dd282763013c0fcc81efbb8e13db8dd8ec" +dependencies = [ + "num-traits", + "packedvec", + "serde", + "vob", +] + [[package]] name = "spin" version = "0.5.2" @@ -5560,7 +5755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" dependencies = [ "cc", - "cfg-if", + "cfg-if 1.0.0", "libc", "psm", "winapi", @@ -5894,7 +6089,7 @@ version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "fastrand 2.0.1", "rustix", "windows-sys 0.52.0", @@ -5937,7 +6132,7 @@ version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "once_cell", ] @@ -6296,9 +6491,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f7f83d1e4a0e4358ac54c5c3681e5d7da5efc5a7a632c90bb6d6669ddd9bc26" dependencies = [ "async-trait", - "cfg-if", + "cfg-if 1.0.0", "data-encoding", - "enum-as-inner", + "enum-as-inner 0.5.1", "futures-channel", "futures-io", "futures-util", @@ -6320,7 +6515,7 @@ version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aff21aa4dcefb0a1afbfac26deb0adc93888c7d295fb63ab273ef276ba2b7cfe" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "futures-util", "ipconfig", "lazy_static", @@ -6340,6 +6535,15 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "try_from" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "283d3b89e1368717881a9d51dad843cc435380d8109c9e47d38780a324698d8b" +dependencies = [ + "cfg-if 0.1.10", +] + [[package]] name = "tungstenite" version = "0.21.0" @@ -6533,12 +6737,38 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vergen" +version = "7.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21b881cd6636ece9735721cf03c1fe1e774fe258683d084bb2812ab67435749" +dependencies = [ + "anyhow", + "cfg-if 1.0.0", + "enum-iterator", + "getset", + "rustversion", + "thiserror", + "time", +] + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "vob" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c058f4c41e71a043c67744cb76dcc1ae63ece328c1732a72489ccccc2dec23e6" +dependencies = [ + "num-traits", + "rustc_version", + "serde", +] + [[package]] name = "walkdir" version = "2.5.0" @@ -6570,7 +6800,7 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "wasm-bindgen-macro", ] @@ -6595,7 +6825,7 @@ version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "js-sys", "wasm-bindgen", "web-sys", @@ -6890,7 +7120,7 @@ version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "windows-sys 0.48.0", ] diff --git a/crates/snops/Cargo.toml b/crates/snops/Cargo.toml index fdf76672..afcb723c 100644 --- a/crates/snops/Cargo.toml +++ b/crates/snops/Cargo.toml @@ -19,6 +19,8 @@ http.workspace = true indexmap.workspace = true jwt = "0.16.0" lazy_static.workspace = true +prometheus-http-query = "0.8" +promql-parser = "0.3" rand.workspace = true rand_chacha.workspace = true rayon.workspace = true diff --git a/crates/snops/src/cli.rs b/crates/snops/src/cli.rs index f0f2e262..4a0369cf 100644 --- a/crates/snops/src/cli.rs +++ b/crates/snops/src/cli.rs @@ -1,4 +1,4 @@ -use std::{net::SocketAddr, path::PathBuf}; +use std::{fmt::Display, net::SocketAddr, path::PathBuf, str::FromStr}; use clap::Parser; @@ -12,6 +12,9 @@ pub struct Cli { #[arg(long)] pub prometheus: Option, + #[arg(long, default_value_t = PrometheusLocation::Docker)] + pub prometheus_location: PrometheusLocation, + /// Path to the directory containing the stored data #[arg(long, default_value = "snot-control-data")] pub path: PathBuf, @@ -19,3 +22,38 @@ pub struct Cli { #[arg(long)] pub hostname: Option, } + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Default)] +pub enum PrometheusLocation { + Internal, + External, + #[default] + Docker, +} + +impl Display for PrometheusLocation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use PrometheusLocation::*; + + match self { + Internal => f.write_str("internal"), + External => f.write_str("external"), + Docker => f.write_str("docker"), + } + } +} + +impl FromStr for PrometheusLocation { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + use PrometheusLocation::*; + + Ok(match s { + "internal" => Internal, + "external" => External, + "docker" => Docker, + _ => return Err("expected one of 'internal', 'external', 'docker'"), + }) + } +} diff --git a/crates/snops/src/env/mod.rs b/crates/snops/src/env/mod.rs index 0782bee8..7d8e040e 100644 --- a/crates/snops/src/env/mod.rs +++ b/crates/snops/src/env/mod.rs @@ -35,6 +35,7 @@ use crate::{ error::DeserializeError, schema::{ nodes::{ExternalNode, Node}, + outcomes::OutcomeMetrics, storage::{LoadedStorage, DEFAULT_AOT_BIN}, timeline::TimelineEvent, ItemDocument, NodeTargets, @@ -46,6 +47,9 @@ use crate::{ pub struct Environment { pub id: usize, pub storage: Arc, + + pub outcomes: OutcomeMetrics, + // TODO: pub outcome_results: RwLock, pub node_map: BiMap, pub initial_nodes: IndexMap, pub aot_bin: PathBuf, @@ -128,6 +132,7 @@ impl Environment { let mut cannon_configs = HashMap::new(); let mut tx_pipe = TxPipes::default(); let mut timeline = vec![]; + let mut outcomes: Option = None; for document in documents { match document { @@ -239,11 +244,17 @@ impl Environment { timeline.extend(sub_timeline.timeline.into_iter()); } + ItemDocument::Outcomes(sub_outcomes) => match outcomes { + Some(ref mut outcomes) => outcomes.extend(sub_outcomes.metrics.into_iter()), + None => outcomes = Some(sub_outcomes.metrics), + }, + _ => warn!("ignored unimplemented document type"), } } let storage = storage.ok_or(PrepareError::MissingStorage)?; + let outcomes = outcomes.unwrap_or_default(); // review cannon configurations to ensure all playback sources and sinks // have a real file backing them @@ -267,6 +278,8 @@ impl Environment { let env = Environment { id: env_id, storage, + outcomes, + // TODO: outcome_results: Default::default(), node_map, initial_nodes, tx_pipe, diff --git a/crates/snops/src/env/timeline.rs b/crates/snops/src/env/timeline.rs index 8ed21c7c..f3146f0c 100644 --- a/crates/snops/src/env/timeline.rs +++ b/crates/snops/src/env/timeline.rs @@ -4,9 +4,11 @@ use std::{ }; use futures_util::future::join_all; +use prometheus_http_query::response::Data; +use promql_parser::label::{MatchOp, Matcher}; use snops_common::state::{AgentId, AgentState}; use tokio::{select, sync::RwLock, task::JoinHandle}; -use tracing::{debug, error, info}; +use tracing::{debug, error, info, warn}; use super::{ error::{BatchReconcileError, ExecutionError}, @@ -288,6 +290,47 @@ impl Environment { } } + // perform outcome validation + if let Some(prometheus) = &*state.prometheus { + for (outcome_name, outcome) in env.outcomes.iter() { + // TODO: support built-in queries + let Some(mut query) = outcome.query.clone() else { + warn!("built-in queries are unsupported for probably only this commit"); + continue; + }; + + // inject env ID matchers into the PromQL query + query.add_matchers(&[Matcher { + op: MatchOp::Equal, + name: String::from("env_id"), + value: env_id.to_string(), + }]); + + // TODO: store pass/fails in environment + // TODO: prettier output + // TODO: format validation checking (show value and expected range) + + let query_response = prometheus.query(query.into_inner()).get().await; + match query_response { + Ok(result) => match result.data() { + Data::Scalar(sample) => { + let pass = outcome.validation.validate(sample.value()); + if pass { + info!("OUTCOME {outcome_name}: PASS"); + } else { + error!("OUTCOME {outcome_name}: FAIL"); + } + } + _ => warn!("unsupported prometheus query response"), + }, + + Err(e) => { + warn!("failed to validate outcome {outcome_name}: {e}"); + } + } + } + } + info!("------------------------------------------"); info!("playback of environment timeline completed"); info!("------------------------------------------"); diff --git a/crates/snops/src/schema/outcomes.rs b/crates/snops/src/schema/outcomes.rs index 5b429898..04d02a3b 100644 --- a/crates/snops/src/schema/outcomes.rs +++ b/crates/snops/src/schema/outcomes.rs @@ -1,11 +1,168 @@ use indexmap::IndexMap; -use serde::Deserialize; +use promql_parser::{label::Matcher, parser::ast::Expr as PromExpr}; +use serde::{de::Visitor, Deserialize}; + +// TODO: define built-in outcomes here or something /// A document describing a test's expected outcomes. #[derive(Deserialize, Debug, Clone)] pub struct Document { - pub metrics: Metrics, + pub metrics: OutcomeMetrics, +} + +pub type OutcomeMetrics = IndexMap; + +/// An outcome expectation; a metric/query, and a way to validate its value +/// after a timeline ends. +#[derive(Deserialize, Debug, Clone)] +pub struct OutcomeExpectation { + /// The name of the expected outcome, like `network/tps`. + pub name: String, + /// A PromQL query that will be used to verify the outcome. + /// + /// If unspecified, the metric outcome name used may refer to a built-in + /// PromQL, which will be used instead. + pub query: Option, + #[serde(flatten)] + pub validation: OutcomeValidation, + // TODO: do we want a way to only check certain agents? +} + +/// An outcome validation method. +#[derive(Deserialize, Debug, Clone)] +#[serde(untagged)] +pub enum OutcomeValidation { + /// The outcome value must be within a particular range. + Range { + /// The minimum value that the outcome value can be and pass. + min: Option, + /// The maximum value that the outcome value can be and pass. + max: Option, + }, + + /// The outcome value must be equal (or roughly equal) to a particular + /// value. + Eq { + /// A value that the outcome value must be and pass. + /// + /// Use `epsilon` to control a maximum delta between this value and + /// allowed values, so that the allowed range becomes `(eq - + /// epsilon) <= outcome <= (eq + epsilon)`. + eq: f64, + /// See `eq`. + epsilon: Option, + }, +} + +impl OutcomeValidation { + /// Validate a number given outcome validation constraints. + pub fn validate(&self, value: f64) -> bool { + match self { + Self::Range { min, max } => { + if matches!(min, Some(min) if value.lt(min)) { + return false; + } + if matches!(max, Some(max) if value.gt(max)) { + return false; + } + true + } + + Self::Eq { eq, epsilon } => { + let epsilon = epsilon.unwrap_or(f64::EPSILON); + ((eq - epsilon)..=(eq + epsilon)).contains(&value) + } + } + } +} + +/// A PromQL query. +#[derive(Debug, Clone)] +pub struct PromQuery(pub PromExpr); + +impl PromQuery { + pub fn into_inner(self) -> PromExpr { + self.0 + } + + /// Inject environment label matchers into the query. + pub fn add_matchers(&mut self, matchers: &[Matcher]) { + Self::inject_matchers(&mut self.0, matchers); + } + + fn inject_matchers(expr: &mut PromExpr, matchers: &[Matcher]) { + macro_rules! inject { + ($into:expr) => { + Self::inject_matchers(&mut $into, matchers) + }; + ($into:expr, $($into2:expr),+) => { + { + inject!($into); + inject!($($into2),+); + } + }; + } + + // TODO: we may only want to inject matchers on metrics that look like + // `snarkos_XXXX` + match expr { + PromExpr::Aggregate(expr) => inject!(expr.expr), + PromExpr::Unary(expr) => inject!(expr.expr), + PromExpr::Binary(expr) => inject!(expr.lhs, expr.rhs), + PromExpr::Paren(expr) => inject!(expr.expr), + PromExpr::Subquery(expr) => inject!(expr.expr), + PromExpr::NumberLiteral(_) => (), + PromExpr::StringLiteral(_) => (), + PromExpr::VectorSelector(selector) => { + selector.matchers.matchers.extend(matchers.iter().cloned()); + } + PromExpr::MatrixSelector(selector) => selector + .vs + .matchers + .matchers + .extend(matchers.iter().cloned()), + PromExpr::Call(call) => { + call.args + .args + .iter_mut() + .for_each(|arg| Self::inject_matchers(arg, matchers)); + } + PromExpr::Extension(_) => (), + } + } } -// TODO: this definitely needs to be a lot more specific... -pub type Metrics = IndexMap; +impl<'de> Deserialize<'de> for PromQuery { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct PromQueryVisitor; + + impl<'de> Visitor<'de> for PromQueryVisitor { + type Value = PromQuery; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a PromQL query") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + promql_parser::parser::parse(v) + .map(PromQuery) + .map_err(E::custom) + } + } + + deserializer.deserialize_str(PromQueryVisitor) + } +} + +pub type OutcomeResults<'a> = Vec>; + +pub struct OutcomeResult<'a> { + pub name: &'a str, + pub pass: bool, // TODO: need more states than pass/fail? +} diff --git a/crates/snops/src/server/mod.rs b/crates/snops/src/server/mod.rs index 55a9e603..2428c7fd 100644 --- a/crates/snops/src/server/mod.rs +++ b/crates/snops/src/server/mod.rs @@ -35,7 +35,7 @@ use crate::{ cli::Cli, logging::{log_request, req_stamp}, server::rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, - state::{Agent, AgentFlags, AppState, GlobalState}, + state::{util::OpaqueDebug, Agent, AgentFlags, AppState, GlobalState}, }; mod api; @@ -49,6 +49,11 @@ pub async fn start(cli: Cli) -> Result<(), StartError> { let mut path = cli.path.clone(); path.push("data.db"); let db = Surreal::new::(path).await?; + + let prometheus = cli + .prometheus + .and_then(|p| prometheus_http_query::Client::try_from(format!("http://{p}")).ok()); // TODO: https + let state = GlobalState { cli, db, @@ -57,6 +62,7 @@ pub async fn start(cli: Cli) -> Result<(), StartError> { storage: Default::default(), envs: Default::default(), prom_httpsd: Default::default(), + prometheus: OpaqueDebug(prometheus), }; let app = Router::new() @@ -68,17 +74,6 @@ pub async fn start(cli: Cli) -> Result<(), StartError> { .with_state(Arc::new(state)) .layer(middleware::map_response(log_request)) .layer(middleware::from_fn(req_stamp)); - // .layer( - // TraceLayer::new_for_http().make_span_with(DefaultMakeSpan::new(). - // include_headers(true)), - //.on_request(|request: &Request, _span: &Span| { - // tracing::info!("req {} - {}", request.method(), request.uri()); - //}) - //.on_response(|response: &Response, _latency: Duration, span: &Span| { - // span.record("status_code", &tracing::field::display(response.status())); - // tracing::info!("res {}", response.status()) - //}), - // ); let listener = tokio::net::TcpListener::bind("0.0.0.0:1234") .await diff --git a/crates/snops/src/server/prometheus.rs b/crates/snops/src/server/prometheus.rs index 11847160..4677402c 100644 --- a/crates/snops/src/server/prometheus.rs +++ b/crates/snops/src/server/prometheus.rs @@ -1,12 +1,12 @@ -use std::collections::HashMap; +use std::{collections::HashMap, fmt::Write}; use axum::{extract::State, response::IntoResponse, routing::get, Json, Router}; use serde::Serialize; use snops_common::state::AgentState; -use tracing::{debug, info}; +use tracing::debug; use super::AppState; -use crate::{env::EnvPeer, state::AgentAddrs}; +use crate::{cli::PrometheusLocation, env::EnvPeer}; pub(super) fn routes() -> Router { Router::new().route("/httpsd", get(get_httpsd)) } @@ -32,7 +32,6 @@ impl HttpsdResponse { } } -#[axum::debug_handler] async fn get_httpsd(State(state): State) -> impl IntoResponse { let mut prom_httpsd = state.prom_httpsd.lock().await; @@ -49,7 +48,30 @@ async fn get_httpsd(State(state): State) -> impl IntoResponse { let mut static_configs = vec![]; for (agent_id, agent) in pool.iter() { - let Some(agent_addr) = agent.addrs().and_then(AgentAddrs::usable) else { + let Some(mut agent_addr) = + (match (state.cli.prometheus_location, agent.has_label_str("local")) { + // agent is external: serve its external IP + (_, false) => agent + .addrs() + .and_then(|addrs| addrs.external.as_ref()) + .map(ToString::to_string), + + // prometheus and agent are local: use internal IP + (PrometheusLocation::Internal, true) => agent + .addrs() + .and_then(|addrs| addrs.internal.first()) + .map(ToString::to_string), + + // prometheus in docker but agent is local: use host.docker.internal + (PrometheusLocation::Docker, true) => { + Some(String::from("host.docker.internal")) + } + + // prometheus is external but agent is local: agent might not be forwarded; + // TODO + (PrometheusLocation::External, true) => continue, + }) + else { continue; }; @@ -67,11 +89,12 @@ async fn get_httpsd(State(state): State) -> impl IntoResponse { continue; }; - info!("agent {} addrs: {:#?}", agent_id, agent.addrs()); + agent_addr + .write_fmt(format_args!(":{}", agent.metrics_port())) + .unwrap(); static_configs.push(StaticConfig { - // targets: [format!("{agent_addr}:9000")], // TODO: metrics port - targets: ["host.docker.internal:9000".into()], // TODO: don't hard-code this :( + targets: [agent_addr], labels: [ ("env_id".into(), env_id.to_string()), ("agent_id".into(), node_key.to_string()), diff --git a/crates/snops/src/state.rs b/crates/snops/src/state.rs index 9e62565a..f3a609bf 100644 --- a/crates/snops/src/state.rs +++ b/crates/snops/src/state.rs @@ -8,6 +8,7 @@ use std::{ use bimap::BiMap; use fixedbitset::FixedBitSet; use jwt::SignWithKey; +use prometheus_http_query::Client as PrometheusClient; use serde::Deserialize; use snops_common::{ lasso::Spur, @@ -20,6 +21,7 @@ use surrealdb::{engine::local::Db, Surreal}; use tarpc::{client::RpcError, context}; use tokio::sync::{Mutex, RwLock}; +use self::util::OpaqueDebug; use crate::{ cli::Cli, env::Environment, @@ -44,7 +46,9 @@ pub struct GlobalState { pub storage: RwLock>>, pub envs: RwLock>>, + pub prom_httpsd: Mutex, + pub prometheus: OpaqueDebug>, } /// This is the representation of a public addr or a list of internal addrs. @@ -272,6 +276,12 @@ impl Agent { self.ports.as_ref().map(|p| p.rest).unwrap_or_default() } + /// Gets the metrics port of the agent. Assumes the agent is ready, returns + /// 0 if not. + pub fn metrics_port(&self) -> u16 { + self.ports.as_ref().map(|p| p.metrics).unwrap_or_default() + } + /// True when the agent is configured to provide its own local private key pub fn has_local_pk(&self) -> bool { self.flags.local_pk @@ -491,3 +501,31 @@ impl AgentFlags { mask } } + +pub mod util { + use std::fmt::Debug; + + /// A wrapper struct that has an "opaque" `Debug` implementation for types + /// that do not implement `Debug`. + pub struct OpaqueDebug(pub T); + + impl Debug for OpaqueDebug { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("(...)") + } + } + + impl std::ops::Deref for OpaqueDebug { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } + } + + impl std::ops::DerefMut for OpaqueDebug { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } + } +} From 546c80fc532c77a04f68c65201c791da2a07fe71 Mon Sep 17 00:00:00 2001 From: Zander Franks Date: Fri, 5 Apr 2024 20:15:02 -0500 Subject: [PATCH 4/9] feat(metrics): working promql outcome validation Signed-off-by: Zander Franks --- crates/snops-common/src/state.rs | 14 ++++---- crates/snops/src/cli.rs | 2 +- crates/snops/src/env/mod.rs | 9 ++++- crates/snops/src/env/timeline.rs | 35 +++++++++++-------- crates/snops/src/main.rs | 2 +- crates/snops/src/schema/outcomes.rs | 32 ++++++++++++++++-- scripts/metrics/prometheus.yml | 4 +-- specs/test-1tps.yaml | 52 +++++++++++++++++++++++++++++ 8 files changed, 122 insertions(+), 28 deletions(-) create mode 100644 specs/test-1tps.yaml diff --git a/crates/snops-common/src/state.rs b/crates/snops-common/src/state.rs index 0dfa87ff..3e837f13 100644 --- a/crates/snops-common/src/state.rs +++ b/crates/snops-common/src/state.rs @@ -234,8 +234,8 @@ mod strings { } /// for some reason bincode does not allow deserialize_any so if i want to allow -/// end users to type "top", 42, or "persist" i need to do have to copies of this -/// where one is not untagged. +/// end users to type "top", 42, or "persist" i need to do have to copies of +/// this where one is not untagged. /// /// bincode. please. #[derive(Debug, Copy, Default, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -248,8 +248,8 @@ pub enum DocHeightRequest { /// Set the height to the given block Absolute(u32), /// Use the same ledger as configured when the same storage was used. - /// WARNING: this may create issues if the same storage id is reused between tests - /// with different nodes. + /// WARNING: this may create issues if the same storage id is reused between + /// tests with different nodes. #[serde(with = "strings::persist")] Persist, // the control plane doesn't know the heights the nodes are at @@ -266,8 +266,8 @@ pub enum HeightRequest { /// Set the height to the given block Absolute(u32), /// Use the same ledger as configured when the same storage was used. - /// WARNING: this may create issues if the same storage id is reused between tests - /// with different nodes. + /// WARNING: this may create issues if the same storage id is reused between + /// tests with different nodes. Persist, // the control plane doesn't know the heights the nodes are at // TruncateHeight(u32), @@ -367,7 +367,7 @@ impl FromStr for NodeType { lazy_static! { static ref NODE_KEY_REGEX: Regex = Regex::new( - r"^(?Pclient|validator|prover)\/(?P[A-Za-z0-9\-]+)(?:@(?P[A-Za-z0-9\-]+))?$" + r"^(?Pclient|validator|prover)\/(?P[A-Za-z0-9\-]*)(?:@(?P[A-Za-z0-9\-]+))?$" ) .unwrap(); static ref AGENT_ID_REGEX: Regex = Regex::new(r"^[A-Za-z0-9][A-Za-z0-9\-_.]{0,63}$").unwrap(); diff --git a/crates/snops/src/cli.rs b/crates/snops/src/cli.rs index 4a0369cf..52902b0f 100644 --- a/crates/snops/src/cli.rs +++ b/crates/snops/src/cli.rs @@ -16,7 +16,7 @@ pub struct Cli { pub prometheus_location: PrometheusLocation, /// Path to the directory containing the stored data - #[arg(long, default_value = "snot-control-data")] + #[arg(long, default_value = "snops-control-data")] pub path: PathBuf, #[arg(long)] diff --git a/crates/snops/src/env/mod.rs b/crates/snops/src/env/mod.rs index 7d8e040e..3f158df3 100644 --- a/crates/snops/src/env/mod.rs +++ b/crates/snops/src/env/mod.rs @@ -158,7 +158,9 @@ impl Environment { 1 => doc_node_key.to_owned(), _ => { let mut node_key = doc_node_key.to_owned(); - node_key.id.push('-'); + if !node_key.id.is_empty() { + node_key.id.push('-'); + } node_key.id.push_str(&i.to_string()); node_key } @@ -312,6 +314,11 @@ impl Environment { .ok_or(CleanupError::EnvNotFound(*id))?; state.prom_httpsd.lock().await.set_dirty(); + // stop the timeline if it's running + if let Some(handle) = &*env.timeline_handle.lock().await { + handle.abort(); + } + // reconcile all online agents let (ids, handles): (Vec<_>, Vec<_>) = { let agents = state.pool.read().await; diff --git a/crates/snops/src/env/timeline.rs b/crates/snops/src/env/timeline.rs index f3146f0c..899c3bf4 100644 --- a/crates/snops/src/env/timeline.rs +++ b/crates/snops/src/env/timeline.rs @@ -290,6 +290,10 @@ impl Environment { } } + info!("------------------------------------------"); + info!("playback of environment timeline completed"); + info!("------------------------------------------"); + // perform outcome validation if let Some(prometheus) = &*state.prometheus { for (outcome_name, outcome) in env.outcomes.iter() { @@ -312,17 +316,24 @@ impl Environment { let query_response = prometheus.query(query.into_inner()).get().await; match query_response { - Ok(result) => match result.data() { - Data::Scalar(sample) => { - let pass = outcome.validation.validate(sample.value()); - if pass { - info!("OUTCOME {outcome_name}: PASS"); - } else { - error!("OUTCOME {outcome_name}: FAIL"); + Ok(result) => { + let value = match result.data() { + Data::Scalar(sample) => sample.value(), + Data::Vector(vector) => match vector.last() { + Some(item) => item.sample().value(), + None => { + warn!("empty vector response from prometheus"); + continue; + } + }, + _ => { + warn!("unsupported prometheus query response"); + continue; } - } - _ => warn!("unsupported prometheus query response"), - }, + }; + let message = outcome.validation.show_validation(value); + info!("OUTCOME {outcome_name}: {message}"); + } Err(e) => { warn!("failed to validate outcome {outcome_name}: {e}"); @@ -331,10 +342,6 @@ impl Environment { } } - info!("------------------------------------------"); - info!("playback of environment timeline completed"); - info!("------------------------------------------"); - Ok(()) })); diff --git a/crates/snops/src/main.rs b/crates/snops/src/main.rs index b4a96a96..50afe1a9 100644 --- a/crates/snops/src/main.rs +++ b/crates/snops/src/main.rs @@ -23,7 +23,7 @@ async fn main() { }; let env_filter = env_filter - .with_env_var("SNOT_LOG") + .with_env_var("SNOPS_LOG") .from_env_lossy() .add_directive("hyper_util=off".parse().unwrap()) .add_directive("hyper=off".parse().unwrap()) diff --git a/crates/snops/src/schema/outcomes.rs b/crates/snops/src/schema/outcomes.rs index 04d02a3b..79c39827 100644 --- a/crates/snops/src/schema/outcomes.rs +++ b/crates/snops/src/schema/outcomes.rs @@ -1,3 +1,5 @@ +use std::fmt::Display; + use indexmap::IndexMap; use promql_parser::{label::Matcher, parser::ast::Expr as PromExpr}; use serde::{de::Visitor, Deserialize}; @@ -7,6 +9,8 @@ use serde::{de::Visitor, Deserialize}; /// A document describing a test's expected outcomes. #[derive(Deserialize, Debug, Clone)] pub struct Document { + pub name: String, + pub description: Option, pub metrics: OutcomeMetrics, } @@ -16,8 +20,6 @@ pub type OutcomeMetrics = IndexMap; /// after a timeline ends. #[derive(Deserialize, Debug, Clone)] pub struct OutcomeExpectation { - /// The name of the expected outcome, like `network/tps`. - pub name: String, /// A PromQL query that will be used to verify the outcome. /// /// If unspecified, the metric outcome name used may refer to a built-in @@ -74,6 +76,32 @@ impl OutcomeValidation { } } } + + pub fn show_validation(&self, value: f64) -> String { + if self.validate(value) { + format!("✅ pass, {value} is {self}") + } else { + format!("⚠️ expected {value} to be {self}") + } + } +} + +impl Display for OutcomeValidation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use OutcomeValidation::*; + match self { + Range { min, max } => match (min, max) { + (Some(min), Some(max)) => write!(f, "between {min} and {max}"), + (Some(min), None) => write!(f, "at least {min}"), + (None, Some(max)) => write!(f, "at most {max}"), + (None, None) => write!(f, "anything"), + }, + Eq { eq, epsilon } => match epsilon { + Some(epsilon) => write!(f, "equal to {eq} ± {epsilon}"), + None => write!(f, "equal to {eq}"), + }, + } + } } /// A PromQL query. diff --git a/scripts/metrics/prometheus.yml b/scripts/metrics/prometheus.yml index 0e003b8b..a837aa40 100644 --- a/scripts/metrics/prometheus.yml +++ b/scripts/metrics/prometheus.yml @@ -1,6 +1,6 @@ global: - scrape_interval: 15s - scrape_timeout: 10s + scrape_interval: 5s + scrape_timeout: 2s evaluation_interval: 1m scrape_configs: - job_name: prometheus diff --git a/specs/test-1tps.yaml b/specs/test-1tps.yaml new file mode 100644 index 00000000..c6a06f1e --- /dev/null +++ b/specs/test-1tps.yaml @@ -0,0 +1,52 @@ +--- +version: storage.snarkos.testing.monadic.us/v1 + +id: base +name: base-ledger + +generate: + path: ./tests/base + +--- +version: nodes.snarkos.testing.monadic.us/v1 + +name: 4-validators + +nodes: + validator/: + replicas: 4 + key: committee.$ + height: 0 + validators: validator/* + peers: [] + +--- +version: cannon.snarkos.testing.monadic.us/v1 + +name: committee-tx-public +source: + file-name: txs.json +sink: + target: validator/* + tx-delay-ms: 750 + +--- +version: timeline.snarkos.testing.monadic.us/v1 + +name: tx-local + +timeline: + - cannon.await: + - name: committee-tx-public + count: 20 + - duration: 10s + +--- +version: outcomes.snarkos.testing.monadic.us/v1 + +name: 0.1-tps-outcome + +metrics: + network/tps: + query: avg(rate(snarkos_blocks_transactions_total[10s])) + min: 0.1 From 73c32aa5dccbabefc3740173dfb21bc5bfd7444c Mon Sep 17 00:00:00 2001 From: Zander Franks Date: Fri, 5 Apr 2024 22:44:06 -0500 Subject: [PATCH 5/9] feat(metrics): built-in outcome queries Signed-off-by: Zander Franks --- crates/snops/src/env/timeline.rs | 17 +++++++++----- crates/snops/src/schema/outcomes.rs | 35 ++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/crates/snops/src/env/timeline.rs b/crates/snops/src/env/timeline.rs index 899c3bf4..f5dcdd2d 100644 --- a/crates/snops/src/env/timeline.rs +++ b/crates/snops/src/env/timeline.rs @@ -20,7 +20,10 @@ use crate::{ source::{QueryTarget, TxSource}, CannonInstance, }, - schema::timeline::{Action, ActionInstance, EventDuration}, + schema::{ + outcomes::PromQuery, + timeline::{Action, ActionInstance, EventDuration}, + }, state::{Agent, AgentClient, GlobalState}, }; @@ -297,9 +300,13 @@ impl Environment { // perform outcome validation if let Some(prometheus) = &*state.prometheus { for (outcome_name, outcome) in env.outcomes.iter() { - // TODO: support built-in queries - let Some(mut query) = outcome.query.clone() else { - warn!("built-in queries are unsupported for probably only this commit"); + let Some(mut query) = outcome + .query + .as_ref() + .or_else(|| PromQuery::builtin(&outcome_name)) + .cloned() + else { + warn!("unrecognized metric name (no built-in query found)"); continue; }; @@ -311,8 +318,6 @@ impl Environment { }]); // TODO: store pass/fails in environment - // TODO: prettier output - // TODO: format validation checking (show value and expected range) let query_response = prometheus.query(query.into_inner()).get().await; match query_response { diff --git a/crates/snops/src/schema/outcomes.rs b/crates/snops/src/schema/outcomes.rs index 79c39827..47605a77 100644 --- a/crates/snops/src/schema/outcomes.rs +++ b/crates/snops/src/schema/outcomes.rs @@ -1,6 +1,7 @@ -use std::fmt::Display; +use std::{collections::HashMap, fmt::Display}; use indexmap::IndexMap; +use lazy_static::lazy_static; use promql_parser::{label::Matcher, parser::ast::Expr as PromExpr}; use serde::{de::Visitor, Deserialize}; @@ -106,9 +107,35 @@ impl Display for OutcomeValidation { /// A PromQL query. #[derive(Debug, Clone)] -pub struct PromQuery(pub PromExpr); +pub struct PromQuery(PromExpr); impl PromQuery { + /// Parse a PromQL query into a `PromQuery`. + pub fn new(query: &str) -> Result { + promql_parser::parser::parse(query).map(Self) + } + + pub fn builtin(name: &str) -> Option<&Self> { + macro_rules! builtins { + { $($name:literal : $query:literal),+ , } => { + lazy_static! { + static ref QUERY_BUILTINS: HashMap<&'static str, PromQuery> = [ + $(($name, PromQuery::new($query).unwrap())),+ + ] + .into_iter() + .collect(); + } + } + } + + builtins! { + "network/tps": "avg(rate(snarkos_blocks_transactions_total[10s]))", // TODO: time + } + + QUERY_BUILTINS.get(name) + } + + /// Fetch the inner PromQL expression from this query. pub fn into_inner(self) -> PromExpr { self.0 } @@ -178,9 +205,7 @@ impl<'de> Deserialize<'de> for PromQuery { where E: serde::de::Error, { - promql_parser::parser::parse(v) - .map(PromQuery) - .map_err(E::custom) + PromQuery::new(v).map_err(E::custom) } } From cbbcf4b6a4cc959c2f747b07084768c3b185058f Mon Sep 17 00:00:00 2001 From: Zander Date: Mon, 8 Apr 2024 11:19:48 -0500 Subject: [PATCH 6/9] chore: address PR comments --- crates/snops/src/env/timeline.rs | 2 +- crates/snops/src/schema/error.rs | 2 ++ crates/snops/src/schema/outcomes.rs | 6 ++++-- crates/snops/src/server/mod.rs | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/snops/src/env/timeline.rs b/crates/snops/src/env/timeline.rs index f5dcdd2d..1d26a8c0 100644 --- a/crates/snops/src/env/timeline.rs +++ b/crates/snops/src/env/timeline.rs @@ -341,7 +341,7 @@ impl Environment { } Err(e) => { - warn!("failed to validate outcome {outcome_name}: {e}"); + error!("failed to validate outcome {outcome_name}: {e}"); } } } diff --git a/crates/snops/src/schema/error.rs b/crates/snops/src/schema/error.rs index eb365e48..63cb3381 100644 --- a/crates/snops/src/schema/error.rs +++ b/crates/snops/src/schema/error.rs @@ -69,6 +69,8 @@ pub enum SchemaError { NodeTarget(#[from] NodeTargetError), #[error("storage error: {0}")] Storage(#[from] StorageError), + #[error("query parse error: {0}")] + QueryParse(String), } impl_into_status_code!(SchemaError, |value| match value { diff --git a/crates/snops/src/schema/outcomes.rs b/crates/snops/src/schema/outcomes.rs index 47605a77..6c2ac026 100644 --- a/crates/snops/src/schema/outcomes.rs +++ b/crates/snops/src/schema/outcomes.rs @@ -111,8 +111,10 @@ pub struct PromQuery(PromExpr); impl PromQuery { /// Parse a PromQL query into a `PromQuery`. - pub fn new(query: &str) -> Result { - promql_parser::parser::parse(query).map(Self) + pub fn new(query: &str) -> Result { + promql_parser::parser::parse(query) + .map(Self) + .map_err(SchemaError::QueryParse) } pub fn builtin(name: &str) -> Option<&Self> { diff --git a/crates/snops/src/server/mod.rs b/crates/snops/src/server/mod.rs index 2428c7fd..521ee35c 100644 --- a/crates/snops/src/server/mod.rs +++ b/crates/snops/src/server/mod.rs @@ -175,6 +175,7 @@ async fn handle_socket( // TODO: probably want to reconcile with old state? // handshake with client + // unwrap safety: this agent was just `mark_connected` with a valid client let client = agent.rpc().cloned().unwrap(); tokio::spawn(async move { // we do this in a separate task because we don't want to hold up pool insertion From e7910a459a71b5ef530d106701a7d39e50c35e69 Mon Sep 17 00:00:00 2001 From: Zander Date: Mon, 8 Apr 2024 11:21:46 -0500 Subject: [PATCH 7/9] chore: one more small PR comment change --- crates/snops/src/server/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/snops/src/server/mod.rs b/crates/snops/src/server/mod.rs index 521ee35c..2fc99eb6 100644 --- a/crates/snops/src/server/mod.rs +++ b/crates/snops/src/server/mod.rs @@ -180,7 +180,7 @@ async fn handle_socket( tokio::spawn(async move { // we do this in a separate task because we don't want to hold up pool insertion if let Err(e) = client.handshake(tarpc::context::current(), handshake).await { - warn!("failed to perform client handshake: {e}"); + error!("failed to perform client handshake: {e}"); } }); @@ -209,7 +209,7 @@ async fn handle_socket( tokio::spawn(async move { // we do this in a separate task because we don't want to hold up pool insertion if let Err(e) = client.handshake(tarpc::context::current(), handshake).await { - warn!("failed to perform client handshake: {e}"); + error!("failed to perform client handshake: {e}"); } }); From 2d8d61451f282aad33d932f5e16883a77450f887 Mon Sep 17 00:00:00 2001 From: Zander Date: Mon, 8 Apr 2024 11:28:09 -0500 Subject: [PATCH 8/9] fix: PR changes without rust-analyzer :( --- crates/snops/src/schema/error.rs | 2 ++ crates/snops/src/schema/outcomes.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/snops/src/schema/error.rs b/crates/snops/src/schema/error.rs index 63cb3381..696c5da3 100644 --- a/crates/snops/src/schema/error.rs +++ b/crates/snops/src/schema/error.rs @@ -77,6 +77,7 @@ impl_into_status_code!(SchemaError, |value| match value { KeySource(e) => e.into(), NodeTarget(e) => e.into(), Storage(e) => e.into(), + QueryParse(e) => StatusCode::BAD_REQUEST, }); impl Serialize for SchemaError { @@ -91,6 +92,7 @@ impl Serialize for SchemaError { Self::KeySource(e) => state.serialize_field("error", e), Self::NodeTarget(e) => state.serialize_field("error", &e.to_string()), Self::Storage(e) => state.serialize_field("error", e), + Self::QueryParse(e) => state.serialize_field("error", &e), }?; state.end() diff --git a/crates/snops/src/schema/outcomes.rs b/crates/snops/src/schema/outcomes.rs index 6c2ac026..0c98093a 100644 --- a/crates/snops/src/schema/outcomes.rs +++ b/crates/snops/src/schema/outcomes.rs @@ -5,7 +5,7 @@ use lazy_static::lazy_static; use promql_parser::{label::Matcher, parser::ast::Expr as PromExpr}; use serde::{de::Visitor, Deserialize}; -// TODO: define built-in outcomes here or something +use super::error::SchemaError; /// A document describing a test's expected outcomes. #[derive(Deserialize, Debug, Clone)] From dfe05c717a06e05b55e43c2d5b8a055b244bb3b5 Mon Sep 17 00:00:00 2001 From: Zander Franks Date: Mon, 8 Apr 2024 17:20:54 -0500 Subject: [PATCH 9/9] chore: fix clippy Signed-off-by: Zander Franks --- crates/snops/src/env/timeline.rs | 2 +- crates/snops/src/server/prometheus.rs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/snops/src/env/timeline.rs b/crates/snops/src/env/timeline.rs index 1d26a8c0..0a4d9b88 100644 --- a/crates/snops/src/env/timeline.rs +++ b/crates/snops/src/env/timeline.rs @@ -303,7 +303,7 @@ impl Environment { let Some(mut query) = outcome .query .as_ref() - .or_else(|| PromQuery::builtin(&outcome_name)) + .or_else(|| PromQuery::builtin(outcome_name)) .cloned() else { warn!("unrecognized metric name (no built-in query found)"); diff --git a/crates/snops/src/server/prometheus.rs b/crates/snops/src/server/prometheus.rs index 4677402c..2e6e9f68 100644 --- a/crates/snops/src/server/prometheus.rs +++ b/crates/snops/src/server/prometheus.rs @@ -103,7 +103,11 @@ async fn get_httpsd(State(state): State) -> impl IntoResponse { .collect(), }); } - _ => (), + + _ => { + // future-proofing; this comment also disables the + // clippy lint + } } }