From d9a6a8bf063e5948ff2ed13a59a98759b55e0e7c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 9 Jan 2025 09:45:37 -0800 Subject: [PATCH] add end-to-end test that new blueprint on a fresh system is a noop (#7323) --- Cargo.lock | 6 + end-to-end-tests/Cargo.toml | 6 + end-to-end-tests/src/helpers/ctx.rs | 2 +- end-to-end-tests/src/lib.rs | 1 + end-to-end-tests/src/noop_blueprint.rs | 124 +++++++++++++++++++ nexus/types/src/deployment/blueprint_diff.rs | 36 ++++++ 6 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 end-to-end-tests/src/noop_blueprint.rs diff --git a/Cargo.lock b/Cargo.lock index 3bde26679f..40bd736257 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2884,9 +2884,12 @@ dependencies = [ "hickory-resolver", "http", "humantime", + "internal-dns-resolver", + "internal-dns-types", "internet-checksum", "ispf", "macaddr", + "nexus-client", "omicron-sled-agent", "omicron-test-utils", "omicron-workspace-hack", @@ -2898,7 +2901,10 @@ dependencies = [ "serde", "serde_json", "sled-agent-types", + "slog", + "slog-error-chain", "socket2", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "uuid", diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index b5e63179da..bbf5eb20f6 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -15,6 +15,9 @@ bytes.workspace = true chrono.workspace = true http.workspace = true futures.workspace = true +internal-dns-resolver.workspace = true +internal-dns-types.workspace = true +nexus-client.workspace = true omicron-sled-agent.workspace = true omicron-test-utils.workspace = true oxide-client.workspace = true @@ -25,6 +28,9 @@ russh-keys = "0.45.0" serde.workspace = true serde_json.workspace = true sled-agent-types.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +thiserror.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } toml.workspace = true hickory-resolver.workspace = true diff --git a/end-to-end-tests/src/helpers/ctx.rs b/end-to-end-tests/src/helpers/ctx.rs index 5363557502..3301988b8a 100644 --- a/end-to-end-tests/src/helpers/ctx.rs +++ b/end-to-end-tests/src/helpers/ctx.rs @@ -73,7 +73,7 @@ impl Context { } } -fn rss_config() -> Result { +pub fn rss_config() -> Result { let path = "/opt/oxide/sled-agent/pkg/config-rss.toml"; let content = std::fs::read_to_string(&path).unwrap_or(RSS_CONFIG_STR.to_string()); diff --git a/end-to-end-tests/src/lib.rs b/end-to-end-tests/src/lib.rs index 330dc446b4..5994da687a 100644 --- a/end-to-end-tests/src/lib.rs +++ b/end-to-end-tests/src/lib.rs @@ -2,3 +2,4 @@ pub mod helpers; mod instance_launch; mod no_spoof; +mod noop_blueprint; diff --git a/end-to-end-tests/src/noop_blueprint.rs b/end-to-end-tests/src/noop_blueprint.rs new file mode 100644 index 0000000000..66fdf33772 --- /dev/null +++ b/end-to-end-tests/src/noop_blueprint.rs @@ -0,0 +1,124 @@ +//! Test that generating a new blueprint on a freshly-installed system will not +//! change anything. + +#![cfg(test)] + +use internal_dns_resolver::Resolver; +use internal_dns_types::names::ServiceName; +use nexus_client::Client as NexusClient; +use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; +use omicron_test_utils::dev::test_setup_log; +use slog::{debug, info}; +use slog_error_chain::InlineErrorChain; +use std::time::Duration; +use thiserror::Error; + +/// Test that generating a new blueprint on a freshly-installed system will not +/// change anything. +/// +/// If this test fails, there's probably a bug somewhere. Maybe the initial +/// system blueprint is incorrect or incomplete or maybe the planner is doing +/// the wrong thing after initial setup. +#[tokio::test] +async fn new_blueprint_noop() { + // In order to check anything with blueprints, we need to reach the Nexus + // internal API. This in turn requires finding the internal DNS servers. + let rss_config = + crate::helpers::ctx::rss_config().expect("loading RSS config"); + let rack_subnet = &rss_config.rack_network_config.rack_subnet; + println!("rack subnet: {}", rack_subnet); + let logctx = test_setup_log("new_blueprint_noop"); + let resolver = + Resolver::new_from_ip(logctx.log.clone(), rack_subnet.addr()) + .expect("creating internal DNS resolver"); + + // Wait up to 5 minutes to get a working Nexus client. + let nexus_client = wait_for_condition( + || async { + match make_nexus_client(&resolver, &logctx.log).await { + Ok(nexus_client) => Ok(nexus_client), + Err(e) => { + debug!( + &logctx.log, + "obtaining a working Nexus client failed"; + InlineErrorChain::new(&e), + ); + Err(CondCheckError::<()>::NotYet) + } + } + }, + &Duration::from_millis(500), + &Duration::from_secs(300), + ) + .await + .expect("timed out waiting to obtain a working Nexus client"); + println!("Nexus is running and has a target blueprint"); + + // Now generate a new blueprint. + let new_blueprint = nexus_client + .blueprint_regenerate() + .await + .expect("failed to generate new blueprint") + .into_inner(); + println!("new blueprint generated: {}", new_blueprint.id); + let parent_blueprint_id = new_blueprint + .parent_blueprint_id + .expect("generated blueprint always has a parent"); + println!("parent blueprint id: {}", parent_blueprint_id); + + // Fetch its parent. + let parent_blueprint = nexus_client + .blueprint_view(&parent_blueprint_id) + .await + .expect("failed to fetch parent blueprint") + .into_inner(); + + let diff = new_blueprint.diff_since_blueprint(&parent_blueprint); + println!("new blueprint: {}", new_blueprint.id); + println!("differences:"); + println!("{}", diff.display()); + + if diff.has_changes() { + panic!( + "unexpected changes between initial blueprint and \ + newly-generated one (see above)" + ); + } + + logctx.cleanup_successful(); +} + +/// Error returned by [`make_nexus_client()`]. +#[derive(Debug, Error)] +enum MakeNexusError { + #[error("looking up Nexus IP in internal DNS")] + Resolve(#[from] internal_dns_resolver::ResolveError), + #[error("making request to Nexus")] + Request(#[from] nexus_client::Error), +} + +/// Make one attempt to look up the IP of Nexus in internal DNS and make an HTTP +/// request to its internal API to fetch its current target blueprint. +/// +/// If this succeeds, Nexus is ready for the rest of this test to proceed. +/// +/// Returns a client for this Nexus. +async fn make_nexus_client( + resolver: &Resolver, + log: &slog::Logger, +) -> Result { + debug!(log, "doing DNS lookup for Nexus"); + let nexus_ip = resolver.lookup_socket_v6(ServiceName::Nexus).await?; + let url = format!("http://{}", nexus_ip); + debug!(log, "found Nexus IP"; "nexus_ip" => %nexus_ip, "url" => &url); + + let client = NexusClient::new(&url, log.clone()); + + // Once this call succeeds, Nexus is ready for us to proceed. + let blueprint_response = client.blueprint_target_view().await?.into_inner(); + info!(log, "found target blueprint (Nexus is ready)"; + "target_blueprint" => ?blueprint_response + ); + + Ok(client) +} diff --git a/nexus/types/src/deployment/blueprint_diff.rs b/nexus/types/src/deployment/blueprint_diff.rs index 3c5f7a120d..ebe61c5876 100644 --- a/nexus/types/src/deployment/blueprint_diff.rs +++ b/nexus/types/src/deployment/blueprint_diff.rs @@ -14,6 +14,7 @@ use super::{ zone_sort_key, Blueprint, ClickhouseClusterConfig, CockroachDbPreserveDowngrade, DiffBeforeClickhouseClusterConfig, }; +use diffus::Diffable; use nexus_sled_agent_shared::inventory::ZoneKind; use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; @@ -946,6 +947,41 @@ impl BlueprintDiff { pub fn display(&self) -> BlueprintDiffDisplay<'_> { BlueprintDiffDisplay::new(self) } + + /// Returns whether the diff reflects any changes or if the blueprints are + /// equivalent. + pub fn has_changes(&self) -> bool { + // Any changes to physical disks, datasets, or zones would be reflected + // in `self.sleds_modified`, `self.sleds_added`, or + // `self.sleds_removed`. + if !self.sleds_modified.is_empty() + || !self.sleds_added.is_empty() + || !self.sleds_removed.is_empty() + { + return true; + } + + // The clickhouse cluster config has changed if: + // - there was one before and now there isn't + // - there wasn't one before and now there is + // - there's one both before and after and their generation has changed + match ( + &self.before_clickhouse_cluster_config, + &self.after_clickhouse_cluster_config, + ) { + (DiffBeforeClickhouseClusterConfig::Blueprint(None), None) => false, + (DiffBeforeClickhouseClusterConfig::Blueprint(None), Some(_)) => { + true + } + (DiffBeforeClickhouseClusterConfig::Blueprint(Some(_)), None) => { + true + } + ( + DiffBeforeClickhouseClusterConfig::Blueprint(Some(before)), + Some(after), + ) => before.diff(&after).is_change(), + } + } } /// A printable representation of `ClickhouseClusterConfig` diff tables where