From 0cd723517ac6688dbe9d71c9b256f3300de3daa2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 20 Aug 2024 19:52:23 -0700 Subject: [PATCH 01/24] first cut --- .config/nextest.toml | 2 +- Cargo.lock | 30 +++ Cargo.toml | 2 + dev-tools/xtask/Cargo.toml | 2 + dev-tools/xtask/src/live_test.rs | 150 +++++++++++++ dev-tools/xtask/src/main.rs | 5 + live-tests/Cargo.toml | 37 ++++ live-tests/build.rs | 10 + live-tests/tests/common/mod.rs | 205 +++++++++++++++++ live-tests/tests/test_nexus_add_remove.rs | 233 ++++++++++++++++++++ nexus/reconfigurator/preparation/src/lib.rs | 130 ++++++----- nexus/src/app/deployment.rs | 63 +----- 12 files changed, 748 insertions(+), 121 deletions(-) create mode 100644 dev-tools/xtask/src/live_test.rs create mode 100644 live-tests/Cargo.toml create mode 100644 live-tests/build.rs create mode 100644 live-tests/tests/common/mod.rs create mode 100644 live-tests/tests/test_nexus_add_remove.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index 95d4c20102..2d3660f9a4 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -10,7 +10,7 @@ experimental = ["setup-scripts"] [[profile.default.scripts]] # Exclude omicron-dev tests from crdb-seed as we explicitly want to simulate an # environment where the seed file doesn't exist. 
-filter = 'rdeps(nexus-test-utils) - package(omicron-dev)' +filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' [profile.ci] diff --git a/Cargo.lock b/Cargo.lock index 6bd71f6d38..c5bf7b24ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5997,6 +5997,34 @@ dependencies = [ "uuid", ] +[[package]] +name = "omicron-live-tests" +version = "0.1.0" +dependencies = [ + "anyhow", + "assert_matches", + "dropshot", + "futures", + "internal-dns", + "nexus-client", + "nexus-config", + "nexus-db-model", + "nexus-db-queries", + "nexus-reconfigurator-planning", + "nexus-reconfigurator-preparation", + "nexus-sled-agent-shared", + "nexus-types", + "omicron-common", + "omicron-rpaths", + "omicron-test-utils", + "pq-sys", + "reqwest", + "serde", + "slog", + "tokio", + "uuid", +] + [[package]] name = "omicron-nexus" version = "0.1.0" @@ -12171,6 +12199,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "camino-tempfile", "cargo_metadata", "cargo_toml", "clap", @@ -12179,6 +12208,7 @@ dependencies = [ "serde", "swrite", "tabled", + "textwrap", "toml 0.8.19", "usdt", ] diff --git a/Cargo.toml b/Cargo.toml index 83aea83ddf..cbf660b8ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,7 @@ members = [ "internal-dns", "ipcc", "key-manager", + "live-tests", "nexus", "nexus-config", "nexus-sled-agent-shared", @@ -168,6 +169,7 @@ default-members = [ "internal-dns", "ipcc", "key-manager", + "live-tests", "nexus", "nexus-config", "nexus-sled-agent-shared", diff --git a/dev-tools/xtask/Cargo.toml b/dev-tools/xtask/Cargo.toml index ec1b7825c6..508d0c73ee 100644 --- a/dev-tools/xtask/Cargo.toml +++ b/dev-tools/xtask/Cargo.toml @@ -24,6 +24,7 @@ workspace = true # downstream binaries do depend on it.) 
anyhow.workspace = true camino.workspace = true +camino-tempfile.workspace = true cargo_toml = "0.20" cargo_metadata.workspace = true clap.workspace = true @@ -32,5 +33,6 @@ macaddr.workspace = true serde.workspace = true swrite.workspace = true tabled.workspace = true +textwrap.workspace = true toml.workspace = true usdt.workspace = true diff --git a/dev-tools/xtask/src/live_test.rs b/dev-tools/xtask/src/live_test.rs new file mode 100644 index 0000000000..a2b203fc01 --- /dev/null +++ b/dev-tools/xtask/src/live_test.rs @@ -0,0 +1,150 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Subcommand: cargo xtask live-test + +use anyhow::{bail, Context, Result}; +use clap::Parser; +use std::process::Command; + +// XXX-dap This should probably bail out if !illumos + +#[derive(Parser)] +pub struct Args {} + +pub fn run_cmd(_args: Args) -> Result<()> { + let tmpdir = + camino_tempfile::tempdir().context("creating temporary directory")?; + let tarball = camino::Utf8PathBuf::try_from( + std::env::current_dir() + .map(|d| d.join("target")) + .context("getting current directory")?, + ) + .context("non-UTF-8 current directory")? 
+ .join("live-test-archive.tgz"); + let contents = tmpdir.path().join("live-test-archive"); + let archive_file = contents.join("omicron-live-tests.tar.zst"); + + eprintln!("using temporary directory: {}", tmpdir.path()); + eprintln!("will create archive file: {}", archive_file); + eprintln!("output tarball: {}", tarball); + eprintln!(); + + std::fs::create_dir(&contents) + .with_context(|| format!("mkdir {:?}", &contents))?; + + let cargo = + std::env::var("CARGO").unwrap_or_else(|_| String::from("cargo")); + let mut command = Command::new(&cargo); + + command.arg("nextest"); + command.arg("archive"); + command.arg("--package"); + command.arg("omicron-live-tests"); + command.arg("--archive-file"); + command.arg(&archive_file); + run_subcmd(command)?; + + // Bundle up the source. + // XXX-dap using git here is debatable. + // XXX-dap consider adding nextest binary + let mut command = Command::new("git"); + command.arg("archive"); + command.arg("--format=tar.gz"); + command.arg("--prefix=live-test-archive/"); + command.arg(format!("--add-file={}", &archive_file)); + command.arg("HEAD"); + let archive_stream = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&tarball) + .with_context(|| format!("open {:?}", tarball))?; + command.stdout(archive_stream); + + run_subcmd(command)?; + + drop(tmpdir); + + eprint!("created: "); + println!("{}", &tarball); + eprintln!("\nTo use this:\n"); + eprintln!( + "1. Copy the tarball to the switch zone in a deployed Omicron system.\n" + ); + let raw = &[ + "scp \\", + &format!("{} \\", &tarball), + "root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root", + ] + .join("\n"); + let text = textwrap::wrap( + &raw, + textwrap::Options::new(160) + .initial_indent(" e.g., ") + .subsequent_indent(" "), + ); + eprintln!("{}\n", text.join("\n")); + eprintln!("2. 
Copy the `cargo-nextest` binary to the same place.\n"); + let raw = &[ + "scp \\", + "$(which cargo-nextest) \\", + "root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root", + ] + .join("\n"); + let text = textwrap::wrap( + &raw, + textwrap::Options::new(160) + .initial_indent(" e.g., ") + .subsequent_indent(" "), + ); + eprintln!("{}\n", text.join("\n")); + eprintln!("3. On that system, unpack the tarball with:\n"); + eprintln!(" tar xzf {}\n", tarball.file_name().unwrap()); + eprintln!("4. On that system, run tests with:\n"); + let raw = &[ + "./cargo-nextest nextest run \\", + &format!( + "--archive-file {}/{} \\", + contents.file_name().unwrap(), + archive_file.file_name().unwrap() + ), + &format!("--workspace-remap {}", contents.file_name().unwrap()), + ] + .join("\n"); + let text = textwrap::wrap( + &raw, + textwrap::Options::new(160) + .initial_indent(" ") + .subsequent_indent(" "), + ); + eprintln!("{}\n", text.join("\n")); + + Ok(()) +} + +// XXX-dap commonize with clippy +fn run_subcmd(mut command: Command) -> Result<()> { + eprintln!( + "running: {} {}", + command.get_program().to_str().unwrap(), + command + .get_args() + .map(|arg| format!("{:?}", arg.to_str().unwrap())) + .collect::>() + .join(" ") + ); + + let exit_status = command + .spawn() + .context("failed to spawn child process")? + .wait() + .context("failed to wait for child process")?; + + if !exit_status.success() { + bail!("failed: {}", exit_status); + } + + Ok(()) +} diff --git a/dev-tools/xtask/src/main.rs b/dev-tools/xtask/src/main.rs index 02fd05a198..0d61c1035c 100644 --- a/dev-tools/xtask/src/main.rs +++ b/dev-tools/xtask/src/main.rs @@ -18,6 +18,7 @@ mod check_workspace_deps; mod clippy; #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] mod external; +mod live_test; mod usdt; #[cfg(target_os = "illumos")] @@ -59,6 +60,9 @@ enum Cmds { /// Download binaries, OpenAPI specs, and other out-of-repo utilities. 
Download(external::External), + /// Create a bundle of live tests + LiveTest(live_test::Args), + /// Utilities for working with MGS. MgsDev(external::External), /// Utilities for working with Omicron. @@ -127,6 +131,7 @@ fn main() -> Result<()> { external.exec_bin("xtask-downloader") } } + Cmds::LiveTest(args) => live_test::run_cmd(args), Cmds::MgsDev(external) => external.exec_bin("mgs-dev"), Cmds::OmicronDev(external) => external.exec_bin("omicron-dev"), Cmds::Openapi(external) => external.exec_bin("openapi-manager"), diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml new file mode 100644 index 0000000000..528b653c0b --- /dev/null +++ b/live-tests/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "omicron-live-tests" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +# See omicron-rpaths for more about the "pq-sys" dependency. +pq-sys = "*" + +[dev-dependencies] +anyhow.workspace = true +assert_matches.workspace = true +dropshot.workspace = true +futures.workspace = true +internal-dns.workspace = true +nexus-client.workspace = true +nexus-config.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +nexus-reconfigurator-planning.workspace = true +nexus-reconfigurator-preparation.workspace = true +nexus-sled-agent-shared.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-test-utils.workspace = true +reqwest.workspace = true +serde.workspace = true +slog.workspace = true +tokio.workspace = true +uuid.workspace = true + +[lints] +workspace = true diff --git a/live-tests/build.rs b/live-tests/build.rs new file mode 100644 index 0000000000..1ba9acd41c --- /dev/null +++ b/live-tests/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs new file mode 100644 index 0000000000..36eede69b6 --- /dev/null +++ b/live-tests/tests/common/mod.rs @@ -0,0 +1,205 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{anyhow, bail, Context}; +use dropshot::test_util::LogContext; +use internal_dns::resolver::Resolver; +use internal_dns::ServiceName; +use nexus_config::PostgresConfigWithUrl; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::SledFilter; +use omicron_common::address::Ipv6Subnet; +use slog::info; +use slog::o; +use std::net::SocketAddrV6; +use std::sync::Arc; + +// XXX-dap point tmp at some place other than /tmp + +pub struct LiveTestContext { + logctx: LogContext, + opctx: OpContext, + resolver: Resolver, + datastore: Arc, +} + +impl LiveTestContext { + pub async fn new( + test_name: &'static str, + ) -> Result { + // XXX-dap check that we're actually running inside an environment and + // not in some workspace + let logctx = omicron_test_utils::dev::test_setup_log(test_name); + let log = &logctx.log; + let resolver = create_resolver(log)?; + let datastore = create_datastore(&log, &resolver).await?; + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + check_environment(&opctx, &datastore).await?; + Ok(LiveTestContext { logctx, opctx, resolver, datastore }) + } + + pub fn cleanup_successful(self) { + self.logctx.cleanup_successful(); + } + + pub fn log(&self) -> &slog::Logger { + &self.logctx.log + } + + pub async fn any_internal_nexus_client( + &self, + ) -> Result { + let 
sockaddr = self + .resolver + .lookup_socket_v6(ServiceName::Nexus) + .await + .context("looking up Nexus in internal DNS")?; + Ok(self.specific_internal_nexus_client(sockaddr)) + } + + pub fn specific_internal_nexus_client( + &self, + sockaddr: SocketAddrV6, + ) -> nexus_client::Client { + let url = format!("http://{}", sockaddr); + let log = self.logctx.log.new(o!("nexus_internal_url" => url.clone())); + nexus_client::Client::new(&url, log) + } + + pub async fn all_internal_nexus_clients( + &self, + ) -> Result, anyhow::Error> { + Ok(self + .resolver + .lookup_all_socket_v6(ServiceName::Nexus) + .await + .context("looking up Nexus in internal DNS")? + .into_iter() + .map(|s| self.specific_internal_nexus_client(s)) + .collect()) + } + + pub fn opctx(&self) -> &OpContext { + &self.opctx + } + + pub fn datastore(&self) -> &DataStore { + &self.datastore + } +} + +fn create_resolver(log: &slog::Logger) -> Result { + // In principle, we should look at /etc/resolv.conf to find the + // DNS servers. In practice, this usually isn't populated + // today. See oxidecomputer/omicron#2122. + // + // However, the address selected below should work for most + // existing Omicron deployments today. That's because while the + // base subnet is in principle configurable in config-rss.toml, + // it's very uncommon to change it from the default value used + // here. 
+ let subnet = Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); + eprintln!("note: using DNS server for subnet {}", subnet.net()); + internal_dns::resolver::Resolver::new_from_subnet(log.clone(), subnet) + .with_context(|| { + format!("creating DNS resolver for subnet {}", subnet.net()) + }) +} + +async fn create_datastore( + log: &slog::Logger, + resolver: &Resolver, +) -> Result, anyhow::Error> { + let sockaddrs = resolver + .lookup_all_socket_v6(ServiceName::Cockroach) + .await + .context("resolving CockroachDB")?; + + let url = format!( + "postgresql://root@{}/omicron?sslmode=disable", + sockaddrs + .into_iter() + .map(|a| a.to_string()) + .collect::>() + .join(",") + ) + .parse::() + .context("failed to parse constructed postgres URL")?; + + let db_config = nexus_db_queries::db::Config { url }; + let pool = Arc::new(nexus_db_queries::db::Pool::new(log, &db_config)); + let datastore = DataStore::new_unchecked(log.clone(), pool) + .map_err(|s| anyhow!("creating DataStore: {s}"))?; + + // XXX-dap TODO-cleanup put all this into a Datastore::new_nowait() or + // something + let expected_version = nexus_db_model::SCHEMA_VERSION; + let (found_version, found_target) = datastore + .database_schema_version() + .await + .context("loading database schema version")?; + eprintln!( + "create_datastore(): found_version {found_version}, \ + found_target {found_target:?}" + ); + + if let Some(found_target) = found_target { + bail!( + "database schema check failed: apparently mid-upgrade \ + (found_target = {found_target})" + ); + } + + if found_version != expected_version { + bail!( + "database schema check failed: \ + expected {expected_version}, found {found_version}", + ); + } + + Ok(Arc::new(datastore)) +} + +async fn check_environment( + opctx: &OpContext, + datastore: &DataStore, +) -> Result<(), anyhow::Error> { + const ALLOWED_GIMLET_SERIALS: &[&str] = &[ + // test rig: "madrid" + "BRM42220004", + "BRM42220081", + "BRM42220007", + "BRM42220046", + // test rig: 
"london" + "BRM42220036", + "BRM42220062", + "BRM42220030", + "BRM44220007", + ]; + + // Refuse to operate in an environment that might contain real Oxide + // hardware that's not known to be part of a test rig. This is deliberately + // conservative. + let scary_sleds = datastore + .sled_list_all_batched(opctx, SledFilter::Commissioned) + .await + .context("check_environment: listing commissioned sleds")? + .into_iter() + .filter_map(|s| { + (s.part_number() != "i86pc" + && !ALLOWED_GIMLET_SERIALS.contains(&s.serial_number())) + .then(|| s.serial_number().to_owned()) + }) + .collect::>(); + if scary_sleds.is_empty() { + info!(&opctx.log, "environment verified"); + Ok(()) + } else { + Err(anyhow!( + "refusing to operate in an environment with an unknown system: {}", + scary_sleds.join(", ") + )) + } +} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs new file mode 100644 index 0000000000..d969ae54c1 --- /dev/null +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -0,0 +1,233 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +mod common; + +use anyhow::Context; +use assert_matches::assert_matches; +use common::LiveTestContext; +use futures::TryStreamExt; +use nexus_client::types::BlueprintTargetSet; +use nexus_client::types::Saga; +use nexus_client::types::SagaState; +use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; +use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; +use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_sled_agent_shared::inventory::ZoneKind; +use nexus_types::deployment::SledFilter; +use omicron_common::address::NEXUS_INTERNAL_PORT; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; +use slog::{debug, info, o}; +use std::net::SocketAddrV6; +use std::time::Duration; + +// XXX-dap clean up in general +// XXX-dap clean up logging +#[tokio::test] +async fn test_nexus_add_remove() { + let lc = LiveTestContext::new("test_nexus_add_remove").await.unwrap(); + let log = lc.log(); + let mylog = lc.log().new(o!("top-level" => true)); + + let nexus = + lc.any_internal_nexus_client().await.expect("internal Nexus client"); + info!(mylog, "have Nexus client"); + + // Fetch the current target blueprint. + let target_blueprint = nexus + .blueprint_target_view() + .await + .expect("fetch target config") + .into_inner(); + if !target_blueprint.enabled { + panic!("refusing to modify a system with target blueprint disabled"); + } + + let blueprint1 = nexus + .blueprint_view(&target_blueprint.target_id) + .await + .expect("fetch target blueprint") + .into_inner(); + info!(mylog, "have current blueprint"); + + // Assemble a new blueprint based on the current target that adds a Nexus + // zone on some sled. 
+ let opctx = lc.opctx(); + let datastore = lc.datastore(); + let planning_input = PlanningInputFromDb::assemble(&opctx, &datastore) + .await + .expect("planning input"); + info!(mylog, "have PlanningInput"); + + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint1, + &planning_input, + "test-suite", + ) + .expect("BlueprintBuilder"); + + let sled_id = planning_input + .all_sled_ids(SledFilter::Commissioned) + .next() + .expect("any sled id"); + + let nnexus = + builder.sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); + let count = builder + .sled_ensure_zone_multiple_nexus(sled_id, nnexus + 1) + .expect("adding Nexus"); + assert_matches!(count, EnsureMultiple::Changed { added: 1, removed: 0 }); + let blueprint2 = builder.build(); + info!(mylog, "built new blueprint"; + "blueprint1_id" => %blueprint1.id, + "blueprint2_id" => %blueprint2.id + ); + + // Make this the new target. + nexus.blueprint_import(&blueprint2).await.expect("importing new blueprint"); + nexus + .blueprint_target_set(&BlueprintTargetSet { + enabled: true, + target_id: blueprint2.id, + }) + .await + .expect("setting new target"); + info!(mylog, "imported blueprint"; "blueprint_id" => %blueprint2.id); + + // Figure out which zone is new and make a new client for it. + let diff = blueprint2.diff_since_blueprint(&blueprint1); + let new_zone = diff + .zones + .added + .values() + .next() + .expect("at least one sled with added zones") + .zones + .iter() + .next() + .expect("at least one added zone on that sled"); + assert_eq!(new_zone.kind(), ZoneKind::Nexus); + let new_zone_addr = new_zone.underlay_address(); + let new_zone_sockaddr = + SocketAddrV6::new(new_zone_addr, NEXUS_INTERNAL_PORT, 0, 0); + let new_zone_client = lc.specific_internal_nexus_client(new_zone_sockaddr); + + // Wait for the new Nexus zone to show up and be usable. 
+ let initial_sagas_list = wait_for_condition( + || async { + list_sagas(&new_zone_client).await.map_err(|e| { + debug!(log, + "waiting for new Nexus to be available: listing sagas: {e:#}" + ); + CondCheckError::<()>::NotYet + }) + }, + &Duration::from_millis(50), + &Duration::from_secs(60), + ) + .await + .expect("new Nexus to be usable"); + assert!(initial_sagas_list.is_empty()); + info!(mylog, "new Nexus is online"); + + // Create a demo saga. + let demo_saga = new_zone_client + .saga_demo_create() + .await + .expect("new Nexus saga demo create"); + let saga_id = demo_saga.saga_id; + + let sagas_list = + list_sagas(&new_zone_client).await.expect("new Nexus sagas_list"); + assert_eq!(sagas_list.len(), 1); + assert_eq!(sagas_list[0].id, saga_id); + info!(mylog, "ready to expunge"); + + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint2, + &planning_input, + "test-suite", + ) + .expect("BlueprintBuilder"); + builder.sled_expunge_zone(sled_id, new_zone.id()).expect("expunge zone"); + let blueprint3 = builder.build(); + nexus.blueprint_import(&blueprint3).await.expect("importing new blueprint"); + nexus + .blueprint_target_set(&BlueprintTargetSet { + enabled: true, + target_id: blueprint3.id, + }) + .await + .expect("setting new target"); + info!(mylog, "imported blueprint with Nexus expunged and set target"; + "blueprint_id" => %blueprint3.id + ); + + // Wait for some other Nexus instance to pick up the saga. 
+ let nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); + let nexus_found = wait_for_condition( + || async { + for nexus_client in &nexus_clients { + let Ok(sagas) = list_sagas(&nexus_client).await else { + continue; + }; + + debug!(mylog, "found sagas (last): {:?}", sagas); + if sagas.into_iter().any(|s| s.id == saga_id) { + return Ok(nexus_client); + } + } + + return Err(CondCheckError::<()>::NotYet); + }, + &Duration::from_millis(50), + &Duration::from_secs(60), + ) + .await + .unwrap(); + + info!(mylog, "found saga in previous Nexus instance"); + // Now, complete it on this instance. + nexus_found + .saga_demo_complete(&demo_saga.demo_saga_id) + .await + .expect("complete demo saga"); + + // Completion is not synchronous -- that just unblocked the saga. So we + // need to poll a bit to wait for it to actually finish. + let found = wait_for_condition( + || async { + let sagas = list_sagas(&nexus_found).await.expect("listing sagas"); + debug!(mylog, "found sagas (last): {:?}", sagas); + let found = sagas.into_iter().find(|s| s.id == saga_id).unwrap(); + if matches!(found.state, SagaState::Succeeded) { + Ok(found) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .unwrap(); + + assert_eq!(found.id, saga_id); + assert!(matches!(found.state, SagaState::Succeeded)); + + lc.cleanup_successful(); +} + +async fn list_sagas( + client: &nexus_client::Client, +) -> Result, anyhow::Error> { + client + .saga_list_stream(None, None) + .try_collect::>() + .await + .context("listing sagas") +} diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index fc0e4638f8..a637241aaa 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -35,6 +35,7 @@ use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use 
omicron_common::api::external::Error; +use omicron_common::api::external::InternalContext; use omicron_common::api::external::LookupType; use omicron_common::disk::DiskIdentity; use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; @@ -72,6 +73,74 @@ pub struct PlanningInputFromDb<'a> { } impl PlanningInputFromDb<'_> { + pub async fn assemble( + opctx: &OpContext, + datastore: &DataStore, + ) -> Result { + opctx.check_complex_operations_allowed()?; + let sled_rows = datastore + .sled_list_all_batched(opctx, SledFilter::Commissioned) + .await + .internal_context("fetching all sleds")?; + let zpool_rows = datastore + .zpool_list_all_external_batched(opctx) + .await + .internal_context("fetching all external zpool rows")?; + let ip_pool_range_rows = { + let (authz_service_ip_pool, _) = datastore + .ip_pools_service_lookup(opctx) + .await + .internal_context("fetching IP services pool")?; + datastore + .ip_pool_list_ranges_batched(opctx, &authz_service_ip_pool) + .await + .internal_context("listing services IP pool ranges")? + }; + let external_ip_rows = datastore + .external_ip_list_service_all_batched(opctx) + .await + .internal_context("fetching service external IPs")?; + let service_nic_rows = datastore + .service_network_interfaces_all_list_batched(opctx) + .await + .internal_context("fetching service NICs")?; + let internal_dns_version = datastore + .dns_group_latest_version(opctx, DnsGroup::Internal) + .await + .internal_context("fetching internal DNS version")? + .version; + let external_dns_version = datastore + .dns_group_latest_version(opctx, DnsGroup::External) + .await + .internal_context("fetching external DNS version")? 
+ .version; + let cockroachdb_settings = datastore + .cockroachdb_settings(opctx) + .await + .internal_context("fetching cockroachdb settings")?; + + let planning_input = PlanningInputFromDb { + sled_rows: &sled_rows, + zpool_rows: &zpool_rows, + ip_pool_range_rows: &ip_pool_range_rows, + target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, + target_nexus_zone_count: NEXUS_REDUNDANCY, + target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, + target_cockroachdb_cluster_version: + CockroachDbClusterVersion::POLICY, + external_ip_rows: &external_ip_rows, + service_nic_rows: &service_nic_rows, + log: &opctx.log, + internal_dns_version, + external_dns_version, + cockroachdb_settings: &cockroachdb_settings, + } + .build() + .internal_context("assembling planning_input")?; + + Ok(planning_input) + } + pub fn build(&self) -> Result { let service_ip_pool_ranges = self.ip_pool_range_rows.iter().map(IpRange::from).collect(); @@ -195,65 +264,8 @@ pub async fn reconfigurator_state_load( datastore: &DataStore, ) -> Result { opctx.check_complex_operations_allowed()?; - let sled_rows = datastore - .sled_list_all_batched(opctx, SledFilter::Commissioned) - .await - .context("listing sleds")?; - let zpool_rows = datastore - .zpool_list_all_external_batched(opctx) - .await - .context("listing zpools")?; - let ip_pool_range_rows = { - let (authz_service_ip_pool, _) = datastore - .ip_pools_service_lookup(opctx) - .await - .context("fetching IP services pool")?; - datastore - .ip_pool_list_ranges_batched(opctx, &authz_service_ip_pool) - .await - .context("listing services IP pool ranges")? 
- }; - let external_ip_rows = datastore - .external_ip_list_service_all_batched(opctx) - .await - .context("fetching service external IPs")?; - let service_nic_rows = datastore - .service_network_interfaces_all_list_batched(opctx) - .await - .context("fetching service NICs")?; - let internal_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::Internal) - .await - .context("fetching internal DNS version")? - .version; - let external_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::External) - .await - .context("fetching external DNS version")? - .version; - let cockroachdb_settings = datastore - .cockroachdb_settings(opctx) - .await - .context("fetching cockroachdb settings")?; - - let planning_input = PlanningInputFromDb { - sled_rows: &sled_rows, - zpool_rows: &zpool_rows, - ip_pool_range_rows: &ip_pool_range_rows, - target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, - target_nexus_zone_count: NEXUS_REDUNDANCY, - target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, - target_cockroachdb_cluster_version: CockroachDbClusterVersion::POLICY, - external_ip_rows: &external_ip_rows, - service_nic_rows: &service_nic_rows, - log: &opctx.log, - internal_dns_version, - external_dns_version, - cockroachdb_settings: &cockroachdb_settings, - } - .build() - .context("assembling planning_input")?; - + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore).await?; let collection_ids = datastore .inventory_collections() .await diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 50ae332d3f..79e7a93e6d 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -4,7 +4,6 @@ //! 
Configuration of the deployment system -use nexus_db_model::DnsGroup; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_reconfigurator_planning::planner::Planner; @@ -13,9 +12,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; -use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::PlanningInput; -use nexus_types::deployment::SledFilter; use nexus_types::inventory::Collection; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -25,9 +22,6 @@ use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; -use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; -use omicron_common::policy::COCKROACHDB_REDUNDANCY; -use omicron_common::policy::NEXUS_REDUNDANCY; use slog_error_chain::InlineErrorChain; use uuid::Uuid; @@ -132,61 +126,8 @@ impl super::Nexus { ) -> Result { let creator = self.id.to_string(); let datastore = self.datastore(); - - let sled_rows = datastore - .sled_list_all_batched(opctx, SledFilter::Commissioned) - .await?; - let zpool_rows = - datastore.zpool_list_all_external_batched(opctx).await?; - let ip_pool_range_rows = { - let (authz_service_ip_pool, _) = - datastore.ip_pools_service_lookup(opctx).await?; - datastore - .ip_pool_list_ranges_batched(opctx, &authz_service_ip_pool) - .await? - }; - let external_ip_rows = - datastore.external_ip_list_service_all_batched(opctx).await?; - let service_nic_rows = datastore - .service_network_interfaces_all_list_batched(opctx) - .await?; - - let internal_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::Internal) - .await - .internal_context( - "fetching internal DNS version for blueprint planning", - )? 
- .version; - let external_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::External) - .await - .internal_context( - "fetching external DNS version for blueprint planning", - )? - .version; - let cockroachdb_settings = - datastore.cockroachdb_settings(opctx).await.internal_context( - "fetching cockroachdb settings for blueprint planning", - )?; - - let planning_input = PlanningInputFromDb { - sled_rows: &sled_rows, - zpool_rows: &zpool_rows, - ip_pool_range_rows: &ip_pool_range_rows, - external_ip_rows: &external_ip_rows, - service_nic_rows: &service_nic_rows, - target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, - target_nexus_zone_count: NEXUS_REDUNDANCY, - target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, - target_cockroachdb_cluster_version: - CockroachDbClusterVersion::POLICY, - log: &opctx.log, - internal_dns_version, - external_dns_version, - cockroachdb_settings: &cockroachdb_settings, - } - .build()?; + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore).await?; // The choice of which inventory collection to use here is not // necessarily trivial. 
Inventory collections may be incomplete due to From ec5fadbf14324bec65090b26281aa85ebbf17f1e Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Aug 2024 09:03:12 -0700 Subject: [PATCH 02/24] add CI that builds it --- .github/buildomat/jobs/build-live-test.sh | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100755 .github/buildomat/jobs/build-live-test.sh diff --git a/.github/buildomat/jobs/build-live-test.sh b/.github/buildomat/jobs/build-live-test.sh new file mode 100755 index 0000000000..df3b2712b5 --- /dev/null +++ b/.github/buildomat/jobs/build-live-test.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#: +#: name = "build-live-tests" +#: variety = "basic" +#: target = "helios-2.0" +#: rust_toolchain = true +#: output_rules = [] + +# Builds the "live-tests" nextest archive + +set -o errexit +set -o pipefail +set -o xtrace + +cargo --version +rustc --version + +# +# Set up our PATH for use with this workspace. +# +source ./env.sh + +banner prerequisites +ptime -m bash ./tools/install_builder_prerequisites.sh -y + +banner live-test +ptime -m cargo xtask live-test From 4544934babb70e6b7f67f0407390e1ee011e2f1c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Aug 2024 09:13:46 -0700 Subject: [PATCH 03/24] avoid trying to run tests by default (does not seem like a great way) --- Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cbf660b8ce..9f9e7aa5f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -169,7 +169,8 @@ default-members = [ "internal-dns", "ipcc", "key-manager", - "live-tests", + # Do not include live-tests in the list of default members because its tests + # only work in a deployed system. 
"nexus", "nexus-config", "nexus-sled-agent-shared", From 8f1cc51a35b0c414d5641ad6fb0f6e861c96903b Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Aug 2024 09:45:18 -0700 Subject: [PATCH 04/24] bail out faster on non-illumos --- Cargo.lock | 2 ++ dev-tools/xtask/src/live_test.rs | 12 ++++++++++-- live-tests/Cargo.toml | 2 ++ live-tests/tests/common/mod.rs | 31 ++++++++++++++++++++++++++++--- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5bf7b24ac..84cfa67213 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6021,6 +6021,8 @@ dependencies = [ "reqwest", "serde", "slog", + "slog-error-chain", + "textwrap", "tokio", "uuid", ] diff --git a/dev-tools/xtask/src/live_test.rs b/dev-tools/xtask/src/live_test.rs index a2b203fc01..1f0fd22bc5 100644 --- a/dev-tools/xtask/src/live_test.rs +++ b/dev-tools/xtask/src/live_test.rs @@ -8,12 +8,20 @@ use anyhow::{bail, Context, Result}; use clap::Parser; use std::process::Command; -// XXX-dap This should probably bail out if !illumos - #[derive(Parser)] pub struct Args {} pub fn run_cmd(_args: Args) -> Result<()> { + // The live tests operate in deployed environments, which always run + // illumos. Bail out quickly if someone tries to run this on a system whose + // binaries won't be usable. (We could compile this subcommand out + // altogether on non-illumos systems, but it seems more confusing to be + // silently missing something you might expect to be there. Plus, you can + // still check and even build *this* code on non-illumos systems.) 
+ if cfg!(not(target_os = "illumos")) { + bail!("live-test archive can only be built on illumos systems"); + } + let tmpdir = camino_tempfile::tempdir().context("creating temporary directory")?; let tarball = camino::Utf8PathBuf::try_from( diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml index 528b653c0b..2a27e8f848 100644 --- a/live-tests/Cargo.toml +++ b/live-tests/Cargo.toml @@ -30,6 +30,8 @@ omicron-test-utils.workspace = true reqwest.workspace = true serde.workspace = true slog.workspace = true +slog-error-chain.workspace = true +textwrap.workspace = true tokio.workspace = true uuid.workspace = true diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 36eede69b6..bec0202fad 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use anyhow::{anyhow, bail, Context}; +use anyhow::{anyhow, bail, ensure, Context}; use dropshot::test_util::LogContext; use internal_dns::resolver::Resolver; use internal_dns::ServiceName; @@ -34,9 +34,10 @@ impl LiveTestContext { let logctx = omicron_test_utils::dev::test_setup_log(test_name); let log = &logctx.log; let resolver = create_resolver(log)?; + check_execution_environment(&resolver).await?; let datastore = create_datastore(&log, &resolver).await?; let opctx = OpContext::for_tests(log.clone(), datastore.clone()); - check_environment(&opctx, &datastore).await?; + check_hardware_environment(&opctx, &datastore).await?; Ok(LiveTestContext { logctx, opctx, resolver, datastore }) } @@ -162,7 +163,31 @@ async fn create_datastore( Ok(Arc::new(datastore)) } -async fn check_environment( +async fn check_execution_environment( + resolver: &Resolver, +) -> Result<(), anyhow::Error> { + ensure!( + cfg!(target_os = "illumos"), + "live tests can only be run on deployed systems, which run illumos" + ); + + 
resolver.lookup_ip(ServiceName::InternalDns).await.map(|_| ()).map_err( + |e| { + let text = format!( + "check_execution_environment(): failed to look up internal DNS \ + in the internal DNS servers.\n\n \ + Are you trying to run this in a development environment? \ + This test can only be run on deployed systems and only from a \ + context with connectivity to the underlay network.\n\n \ + raw error: {}", + slog_error_chain::InlineErrorChain::new(&e) + ); + anyhow!("{}", textwrap::wrap(&text, 80).join("\n")) + }, + ) +} + +async fn check_hardware_environment( opctx: &OpContext, datastore: &DataStore, ) -> Result<(), anyhow::Error> { From 340fc220706527c09a19dd92933e0df8dea25156 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Aug 2024 10:35:14 -0700 Subject: [PATCH 05/24] Bundle up working directory instead of using git archive (much smaller, more accurate tarball) --- dev-tools/xtask/src/live_test.rs | 87 ++++++++++++++--------- live-tests/tests/common/mod.rs | 2 - live-tests/tests/test_nexus_add_remove.rs | 3 +- 3 files changed, 54 insertions(+), 38 deletions(-) diff --git a/dev-tools/xtask/src/live_test.rs b/dev-tools/xtask/src/live_test.rs index 1f0fd22bc5..0837814418 100644 --- a/dev-tools/xtask/src/live_test.rs +++ b/dev-tools/xtask/src/live_test.rs @@ -12,6 +12,8 @@ use std::process::Command; pub struct Args {} pub fn run_cmd(_args: Args) -> Result<()> { + const NAME: &str = "live-test-archive"; + // The live tests operate in deployed environments, which always run // illumos. Bail out quickly if someone tries to run this on a system whose // binaries won't be usable. 
(We could compile this subcommand out @@ -22,25 +24,25 @@ pub fn run_cmd(_args: Args) -> Result<()> { bail!("live-test archive can only be built on illumos systems"); } - let tmpdir = + let tmpdir_root = camino_tempfile::tempdir().context("creating temporary directory")?; - let tarball = camino::Utf8PathBuf::try_from( + let final_tarball = camino::Utf8PathBuf::try_from( std::env::current_dir() .map(|d| d.join("target")) .context("getting current directory")?, ) .context("non-UTF-8 current directory")? - .join("live-test-archive.tgz"); - let contents = tmpdir.path().join("live-test-archive"); - let archive_file = contents.join("omicron-live-tests.tar.zst"); + .join(format!("{}.tgz", NAME)); + let proto_root = tmpdir_root.path().join(NAME); + let nextest_archive_file = proto_root.join("omicron-live-tests.tar.zst"); - eprintln!("using temporary directory: {}", tmpdir.path()); - eprintln!("will create archive file: {}", archive_file); - eprintln!("output tarball: {}", tarball); + eprintln!("using temporary directory: {}", tmpdir_root.path()); + eprintln!("will create archive file: {}", nextest_archive_file); + eprintln!("output tarball: {}", final_tarball); eprintln!(); - std::fs::create_dir(&contents) - .with_context(|| format!("mkdir {:?}", &contents))?; + std::fs::create_dir(&proto_root) + .with_context(|| format!("mkdir {:?}", &proto_root))?; let cargo = std::env::var("CARGO").unwrap_or_else(|_| String::from("cargo")); @@ -51,39 +53,56 @@ pub fn run_cmd(_args: Args) -> Result<()> { command.arg("--package"); command.arg("omicron-live-tests"); command.arg("--archive-file"); - command.arg(&archive_file); + command.arg(&nextest_archive_file); run_subcmd(command)?; - // Bundle up the source. - // XXX-dap using git here is debatable. 
-    // XXX-dap consider adding nextest binary
-    let mut command = Command::new("git");
-    command.arg("archive");
-    command.arg("--format=tar.gz");
-    command.arg("--prefix=live-test-archive/");
-    command.arg(format!("--add-file={}", &archive_file));
-    command.arg("HEAD");
-    let archive_stream = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .truncate(true)
-        .open(&tarball)
-        .with_context(|| format!("open {:?}", tarball))?;
-    command.stdout(archive_stream);
+    // Using nextest archives requires that the source be separately transmitted
+    // to the system where the tests will be run. We're trying to automate
+    // that. So let's bundle up the source and the nextest archive into one big
+    // tarball. But which source files do we bundle? We need:
+    //
+    // - Cargo.toml (nextest expects to find this)
+    // - .config/nextest.toml (nextest's configuration, which is used while
+    //   running the tests)
+    // - live-tests (this is where the tests live, and they might expect stuff
+    //   that exists in here like expectorate files)
+    //
+    // plus the nextest archive file.
+    //
+    // To avoid creating a tarbomb, we want all the files prefixed with
+    // "live-test-archive/". There's no great way to do this with the illumos
+    // tar(1) except to create a temporary directory called "live-test-archive"
+    // that contains the files and then tar'ing up that.
+    //
+    // Ironically, an easy way to construct that directory is with tar(1).
+ let mut command = Command::new("bash"); + command.arg("-c"); + command.arg(format!( + "tar cf - Cargo.toml .config/nextest.toml live-tests | \ + tar xf - -C {:?}", + &proto_root + )); + run_subcmd(command)?; + let mut command = Command::new("tar"); + command.arg("cf"); + command.arg(&final_tarball); + command.arg("-C"); + command.arg(tmpdir_root.path()); + command.arg(NAME); run_subcmd(command)?; - drop(tmpdir); + drop(tmpdir_root); eprint!("created: "); - println!("{}", &tarball); + println!("{}", &final_tarball); eprintln!("\nTo use this:\n"); eprintln!( "1. Copy the tarball to the switch zone in a deployed Omicron system.\n" ); let raw = &[ "scp \\", - &format!("{} \\", &tarball), + &format!("{} \\", &final_tarball), "root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root", ] .join("\n"); @@ -109,16 +128,16 @@ pub fn run_cmd(_args: Args) -> Result<()> { ); eprintln!("{}\n", text.join("\n")); eprintln!("3. On that system, unpack the tarball with:\n"); - eprintln!(" tar xzf {}\n", tarball.file_name().unwrap()); + eprintln!(" tar xzf {}\n", final_tarball.file_name().unwrap()); eprintln!("4. 
On that system, run tests with:\n"); let raw = &[ "./cargo-nextest nextest run \\", &format!( "--archive-file {}/{} \\", - contents.file_name().unwrap(), - archive_file.file_name().unwrap() + NAME, + nextest_archive_file.file_name().unwrap() ), - &format!("--workspace-remap {}", contents.file_name().unwrap()), + &format!("--workspace-remap {}", NAME), ] .join("\n"); let text = textwrap::wrap( diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index bec0202fad..2ba86a5ac4 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -29,8 +29,6 @@ impl LiveTestContext { pub async fn new( test_name: &'static str, ) -> Result { - // XXX-dap check that we're actually running inside an environment and - // not in some workspace let logctx = omicron_test_utils::dev::test_setup_log(test_name); let log = &logctx.log; let resolver = create_resolver(log)?; diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index d969ae54c1..d945f391f2 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -106,8 +106,7 @@ async fn test_nexus_add_remove() { .next() .expect("at least one sled with added zones") .zones - .iter() - .next() + .first() .expect("at least one added zone on that sled"); assert_eq!(new_zone.kind(), ZoneKind::Nexus); let new_zone_addr = new_zone.underlay_address(); From 7c2c50c63a5b1b802920613a85135dcbbe9618b0 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Aug 2024 11:31:06 -0700 Subject: [PATCH 06/24] check TMPDIR --- dev-tools/xtask/src/live_test.rs | 5 +++- live-tests/tests/common/mod.rs | 44 ++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/dev-tools/xtask/src/live_test.rs b/dev-tools/xtask/src/live_test.rs index 0837814418..05b2ab043f 100644 --- a/dev-tools/xtask/src/live_test.rs +++ b/dev-tools/xtask/src/live_test.rs @@ -130,8 +130,11 @@ pub fn run_cmd(_args: 
Args) -> Result<()> { eprintln!("3. On that system, unpack the tarball with:\n"); eprintln!(" tar xzf {}\n", final_tarball.file_name().unwrap()); eprintln!("4. On that system, run tests with:\n"); + // TMPDIR=/var/tmp puts stuff on disk, cached as needed, rather than the + // default /tmp which requires that stuff be in-memory. That can lead to + // great sadness if the tests wind up writing a lot of data. let raw = &[ - "./cargo-nextest nextest run \\", + "TMPDIR=/var/tmp ./cargo-nextest nextest run \\", &format!( "--archive-file {}/{} \\", NAME, diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 2ba86a5ac4..5765edc98f 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -13,11 +13,11 @@ use nexus_types::deployment::SledFilter; use omicron_common::address::Ipv6Subnet; use slog::info; use slog::o; +use std::ffi::OsStr; use std::net::SocketAddrV6; +use std::path::Component; use std::sync::Arc; -// XXX-dap point tmp at some place other than /tmp - pub struct LiveTestContext { logctx: LogContext, opctx: OpContext, @@ -169,20 +169,44 @@ async fn check_execution_environment( "live tests can only be run on deployed systems, which run illumos" ); - resolver.lookup_ip(ServiceName::InternalDns).await.map(|_| ()).map_err( - |e| { - let text = format!( - "check_execution_environment(): failed to look up internal DNS \ + resolver.lookup_ip(ServiceName::InternalDns).await.map_err(|e| { + let text = format!( + "check_execution_environment(): failed to look up internal DNS \ in the internal DNS servers.\n\n \ Are you trying to run this in a development environment? \ This test can only be run on deployed systems and only from a \ context with connectivity to the underlay network.\n\n \ raw error: {}", - slog_error_chain::InlineErrorChain::new(&e) + slog_error_chain::InlineErrorChain::new(&e) + ); + anyhow!("{}", textwrap::wrap(&text, 80).join("\n")) + })?; + + // Warn the user if the temporary directory is /tmp. 
This check is
+    // heuristic. There are other ways they may have specified a tmpfs
+    // temporary directory and we don't claim to catch all of them.
+    //
+    // We could also just go ahead and use /var/tmp, but it's not clear we can
+    // reliably do that at this point (if Rust or other components have cached
+    // TMPDIR) and it would be hard to override.
+    let tmpdir = std::env::temp_dir();
+    let mut tmpdir_components = tmpdir.components().take(2);
+    if let Some(first) = tmpdir_components.next() {
+        if let Some(next) = tmpdir_components.next() {
+            if first == Component::RootDir
+                && next == Component::Normal(OsStr::new("tmp"))
+            {
+                eprintln!(
+                    "WARNING: temporary directory appears to be under /tmp, \
+                    which is generally tmpfs. Consider setting \
+                    TMPDIR=/var/tmp to avoid runaway tests using too much \
+                    memory and swap."
             );
-            anyhow!("{}", textwrap::wrap(&text, 80).join("\n"))
-        },
-    )
+            }
+        }
+    }
+
+    Ok(())
 }
 
 async fn check_hardware_environment(

From 210e4f6632b27c4f5771c89f87a229e0f8cea6a7 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Fri, 23 Aug 2024 11:40:19 -0700
Subject: [PATCH 07/24] these tests need to run serially

---
 .config/nextest.toml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.config/nextest.toml b/.config/nextest.toml
index 2d3660f9a4..a294bcc18b 100644
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -21,18 +21,26 @@ fail-fast = false
 # invocations of nextest happen.
 command = 'cargo run -p crdb-seed --profile test'
 
+[test-groups]
 # The ClickHouse cluster tests currently rely on a hard-coded set of ports for
 # the nodes in the cluster. We would like to relax this in the future, at which
 # point this test-group configuration can be removed or at least loosened to
 # support testing in parallel. For now, enforce strict serialization for all
 # tests with `replicated` in the name.
-[test-groups] clickhouse-cluster = { max-threads = 1 } +# While most Omicron tests operate with their own simulated control plane, the +# live-tests operate on a more realistic, shared control plane and test +# behaviors that conflict with each other. They need to be run serially. +live-tests = { max-threads = 1 } [[profile.default.overrides]] filter = 'package(oximeter-db) and test(replicated)' test-group = 'clickhouse-cluster' +[[profile.default.overrides]] +filter = 'package(omicron-live-tests)' +test-group = 'live-tests' + [[profile.default.overrides]] # These tests can time out under heavy contention. filter = 'binary_id(omicron-nexus::test_all) and test(::schema::)' From 365d13e92c2eef3d806e154c23d47441cb5cb7bb Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 11:40:47 -0700 Subject: [PATCH 08/24] commonize some code --- live-tests/tests/common/mod.rs | 2 + live-tests/tests/common/reconfigurator.rs | 81 ++++++++++++ live-tests/tests/test_nexus_add_remove.rs | 143 ++++++++++------------ 3 files changed, 145 insertions(+), 81 deletions(-) create mode 100644 live-tests/tests/common/reconfigurator.rs diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 5765edc98f..13bf71a4b5 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -2,6 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +pub mod reconfigurator; + use anyhow::{anyhow, bail, ensure, Context}; use dropshot::test_util::LogContext; use internal_dns::resolver::Resolver; diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs new file mode 100644 index 0000000000..7d8ceca46f --- /dev/null +++ b/live-tests/tests/common/reconfigurator.rs @@ -0,0 +1,81 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helpers common to Reconfigurator tests + +use anyhow::{ensure, Context}; +use nexus_client::types::BlueprintTargetSet; +use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; +use nexus_types::deployment::{Blueprint, PlanningInput}; +use slog::{debug, info}; + +pub async fn blueprint_edit_current_target( + log: &slog::Logger, + planning_input: &PlanningInput, + nexus: &nexus_client::Client, + edit_fn: &dyn Fn(&mut BlueprintBuilder) -> Result<(), anyhow::Error>, +) -> Result<(Blueprint, Blueprint), anyhow::Error> { + // Fetch the current target configuration. + info!(log, "editing current target blueprint"); + let target_blueprint = nexus + .blueprint_target_view() + .await + .context("fetch current target config")? + .into_inner(); + debug!(log, "found current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + ensure!( + target_blueprint.enabled, + "refusing to modify a system with target blueprint disabled" + ); + + // Fetch the actual blueprint. + let blueprint1 = nexus + .blueprint_view(&target_blueprint.target_id) + .await + .context("fetch current target blueprint")? + .into_inner(); + debug!(log, "fetched current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + + // Make a new builder based on that blueprint and use `edit_fn` to edit it. + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint1, + &planning_input, + "test-suite", + ) + .context("creating BlueprintBuilder")?; + + edit_fn(&mut builder)?; + + // Assemble the new blueprint, import it, and make it the new target. 
+ let blueprint2 = builder.build(); + info!(log, "assembled new blueprint based on target"; + "current_target_id" => %target_blueprint.target_id, + "new_blueprint_id" => %blueprint2.id, + ); + nexus + .blueprint_import(&blueprint2) + .await + .context("importing new blueprint")?; + debug!(log, "imported new blueprint"; + "blueprint_id" => %blueprint2.id, + ); + nexus + .blueprint_target_set(&BlueprintTargetSet { + enabled: true, + target_id: blueprint2.id, + }) + .await + .expect("setting new target"); + info!(log, "finished editing target blueprint"; + "old_target_id" => %blueprint1.id, + "new_target_id" => %blueprint2.id, + ); + + Ok((blueprint1, blueprint2)) +} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index d945f391f2..bdfc8343af 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -6,9 +6,9 @@ mod common; use anyhow::Context; use assert_matches::assert_matches; +use common::reconfigurator::blueprint_edit_current_target; use common::LiveTestContext; use futures::TryStreamExt; -use nexus_client::types::BlueprintTargetSet; use nexus_client::types::Saga; use nexus_client::types::SagaState; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; @@ -25,77 +25,57 @@ use std::time::Duration; // XXX-dap clean up in general // XXX-dap clean up logging +// XXX-dap make a macro for LiveTestContext +// TODO-coverage This test could check other stuff: +// +// - that after adding: +// - the new Nexus appears in external DNS +// - we can _use_ the new Nexus from the outside +// (e.g., using an `oxide_client` using a custom reqwest resolver that +// points only at that one IP so that we can make sure we're always getting +// that one) +// - that after expungement: +// - we can't reach it any more on the underlay +// - it doesn't appear in external DNS any more +// #[tokio::test] async fn test_nexus_add_remove() { + // Test setup let lc = 
LiveTestContext::new("test_nexus_add_remove").await.unwrap(); let log = lc.log(); let mylog = lc.log().new(o!("top-level" => true)); - let nexus = lc.any_internal_nexus_client().await.expect("internal Nexus client"); - info!(mylog, "have Nexus client"); - - // Fetch the current target blueprint. - let target_blueprint = nexus - .blueprint_target_view() - .await - .expect("fetch target config") - .into_inner(); - if !target_blueprint.enabled { - panic!("refusing to modify a system with target blueprint disabled"); - } - - let blueprint1 = nexus - .blueprint_view(&target_blueprint.target_id) - .await - .expect("fetch target blueprint") - .into_inner(); - info!(mylog, "have current blueprint"); - - // Assemble a new blueprint based on the current target that adds a Nexus - // zone on some sled. let opctx = lc.opctx(); let datastore = lc.datastore(); let planning_input = PlanningInputFromDb::assemble(&opctx, &datastore) .await .expect("planning input"); - info!(mylog, "have PlanningInput"); - - let mut builder = BlueprintBuilder::new_based_on( - log, - &blueprint1, - &planning_input, - "test-suite", - ) - .expect("BlueprintBuilder"); + // First, deploy a new Nexus zone to an arbitrary sled. let sled_id = planning_input .all_sled_ids(SledFilter::Commissioned) .next() .expect("any sled id"); - - let nnexus = - builder.sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); - let count = builder - .sled_ensure_zone_multiple_nexus(sled_id, nnexus + 1) - .expect("adding Nexus"); - assert_matches!(count, EnsureMultiple::Changed { added: 1, removed: 0 }); - let blueprint2 = builder.build(); - info!(mylog, "built new blueprint"; - "blueprint1_id" => %blueprint1.id, - "blueprint2_id" => %blueprint2.id - ); - - // Make this the new target. 
- nexus.blueprint_import(&blueprint2).await.expect("importing new blueprint"); - nexus - .blueprint_target_set(&BlueprintTargetSet { - enabled: true, - target_id: blueprint2.id, - }) - .await - .expect("setting new target"); - info!(mylog, "imported blueprint"; "blueprint_id" => %blueprint2.id); + let (blueprint1, blueprint2) = blueprint_edit_current_target( + log, + &planning_input, + &nexus, + &|builder: &mut BlueprintBuilder| { + let nnexus = builder + .sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); + let count = builder + .sled_ensure_zone_multiple_nexus(sled_id, nnexus + 1) + .context("adding Nexus zone")?; + assert_matches!( + count, + EnsureMultiple::Changed { added: 1, removed: 0 } + ); + Ok(()) + }, + ) + .await + .expect("editing blueprint to add zone"); // Figure out which zone is new and make a new client for it. let diff = blueprint2.diff_since_blueprint(&blueprint1); @@ -132,39 +112,40 @@ async fn test_nexus_add_remove() { assert!(initial_sagas_list.is_empty()); info!(mylog, "new Nexus is online"); - // Create a demo saga. + // Create a demo saga from the new Nexus zone. We'll use this to test that + // when the zone is expunged, its saga gets moved to a different Nexus. let demo_saga = new_zone_client .saga_demo_create() .await .expect("new Nexus saga demo create"); let saga_id = demo_saga.saga_id; - let sagas_list = list_sagas(&new_zone_client).await.expect("new Nexus sagas_list"); assert_eq!(sagas_list.len(), 1); assert_eq!(sagas_list[0].id, saga_id); - info!(mylog, "ready to expunge"); - let mut builder = BlueprintBuilder::new_based_on( + // Now expunge the zone we just created. 
+ info!(mylog, "ready to expunge"); + let _ = blueprint_edit_current_target( log, - &blueprint2, &planning_input, - "test-suite", + &nexus, + &|builder: &mut BlueprintBuilder| { + builder + .sled_expunge_zone(sled_id, new_zone.id()) + .context("expunging zone") + }, ) - .expect("BlueprintBuilder"); - builder.sled_expunge_zone(sled_id, new_zone.id()).expect("expunge zone"); - let blueprint3 = builder.build(); - nexus.blueprint_import(&blueprint3).await.expect("importing new blueprint"); - nexus - .blueprint_target_set(&BlueprintTargetSet { - enabled: true, - target_id: blueprint3.id, - }) - .await - .expect("setting new target"); - info!(mylog, "imported blueprint with Nexus expunged and set target"; - "blueprint_id" => %blueprint3.id - ); + .await + .expect("editing blueprint to expunge zone"); + + // XXX-dap first, wait for the expunged instance to actually go away. + // Otherwise, it's possible we find that one in + // `all_internal_nexus_clients()` and complete the saga before the zone gets + // expunged! + // + // XXX-dap can we check the URL on the client to make sure it's a different + // one? // Wait for some other Nexus instance to pick up the saga. let nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); @@ -188,16 +169,15 @@ async fn test_nexus_add_remove() { ) .await .unwrap(); + info!(mylog, "found saga in a different Nexus instance"); - info!(mylog, "found saga in previous Nexus instance"); - // Now, complete it on this instance. + // Now, complete the demo saga on whichever instance is running it now. + // `saga_demo_complete` is not synchronous. It just unblocks the saga. + // We'll need to poll a bit to wait for it to finish. nexus_found .saga_demo_complete(&demo_saga.demo_saga_id) .await .expect("complete demo saga"); - - // Completion is not synchronous -- that just unblocked the saga. So we - // need to poll a bit to wait for it to actually finish. 
let found = wait_for_condition( || async { let sagas = list_sagas(&nexus_found).await.expect("listing sagas"); @@ -218,7 +198,8 @@ async fn test_nexus_add_remove() { assert_eq!(found.id, saga_id); assert!(matches!(found.state, SagaState::Succeeded)); - lc.cleanup_successful(); + // XXX-dap + // lc.cleanup_successful(); } async fn list_sagas( From 4712076f18bad7e3a4e0c87a5535c6a825cef38c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 12:26:27 -0700 Subject: [PATCH 09/24] DataStore::new_failfast --- live-tests/tests/common/mod.rs | 34 +++------------------- live-tests/tests/test_nexus_add_remove.rs | 3 +- nexus/db-queries/src/db/datastore/mod.rs | 35 ++++++++++++++++++++++- 3 files changed, 39 insertions(+), 33 deletions(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 13bf71a4b5..96cc0d4438 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -4,7 +4,7 @@ pub mod reconfigurator; -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, ensure, Context}; use dropshot::test_util::LogContext; use internal_dns::resolver::Resolver; use internal_dns::ServiceName; @@ -131,36 +131,10 @@ async fn create_datastore( let db_config = nexus_db_queries::db::Config { url }; let pool = Arc::new(nexus_db_queries::db::Pool::new(log, &db_config)); - let datastore = DataStore::new_unchecked(log.clone(), pool) - .map_err(|s| anyhow!("creating DataStore: {s}"))?; - - // XXX-dap TODO-cleanup put all this into a Datastore::new_nowait() or - // something - let expected_version = nexus_db_model::SCHEMA_VERSION; - let (found_version, found_target) = datastore - .database_schema_version() + DataStore::new_failfast(log, pool) .await - .context("loading database schema version")?; - eprintln!( - "create_datastore(): found_version {found_version}, \ - found_target {found_target:?}" - ); - - if let Some(found_target) = found_target { - bail!( - "database schema check failed: apparently mid-upgrade \ - 
(found_target = {found_target})" - ); - } - - if found_version != expected_version { - bail!( - "database schema check failed: \ - expected {expected_version}, found {found_version}", - ); - } - - Ok(Arc::new(datastore)) + .context("creating DataStore") + .map(Arc::new) } async fn check_execution_environment( diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index bdfc8343af..47615fcc65 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -198,8 +198,7 @@ async fn test_nexus_add_remove() { assert_eq!(found.id, saga_id); assert!(matches!(found.state, SagaState::Succeeded)); - // XXX-dap - // lc.cleanup_successful(); + lc.cleanup_successful(); } async fn list_sagas( diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 13c3708e4a..c1cf1a0a3c 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -27,6 +27,7 @@ use crate::db::{ error::{public_error_from_diesel, ErrorHandler}, }; use ::oximeter::types::ProducerRegistry; +use anyhow::{anyhow, bail, Context}; use async_bb8_diesel::{AsyncRunQueryDsl, ConnectionManager}; use diesel::pg::Pg; use diesel::prelude::*; @@ -207,7 +208,7 @@ impl DataStore { /// Constructs a new Datastore object. /// - /// Only returns if the database schema is compatible with Nexus's known + /// Only returns when the database schema is compatible with Nexus's known /// schema version. 
pub async fn new( log: &Logger, @@ -241,6 +242,38 @@ impl DataStore { Ok(datastore) } + /// Constructs a new Datastore, failing if the schema version does not match + /// this program's expected version + pub async fn new_failfast( + log: &Logger, + pool: Arc, + ) -> Result { + let datastore = + Self::new_unchecked(log.new(o!("component" => "datastore")), pool) + .map_err(|e| anyhow!("{}", e))?; + const EXPECTED_VERSION: SemverVersion = nexus_db_model::SCHEMA_VERSION; + let (found_version, found_target) = datastore + .database_schema_version() + .await + .context("loading database schema version")?; + + if let Some(found_target) = found_target { + bail!( + "database schema check failed: apparently mid-upgrade \ + (found_target = {found_target})" + ); + } + + if found_version != EXPECTED_VERSION { + bail!( + "database schema check failed: \ + expected {EXPECTED_VERSION}, found {found_version}", + ); + } + + Ok(datastore) + } + pub fn register_producers(&self, registry: &ProducerRegistry) { registry .register_producer( From a27d14df7c925ffdd826d76cd12b3f7ba364369c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 12:31:08 -0700 Subject: [PATCH 10/24] commonize run_subcmd in xtask --- dev-tools/xtask/src/clippy.rs | 25 +++-------------------- dev-tools/xtask/src/common.rs | 34 ++++++++++++++++++++++++++++++++ dev-tools/xtask/src/live_test.rs | 26 +----------------------- dev-tools/xtask/src/main.rs | 1 + 4 files changed, 39 insertions(+), 47 deletions(-) create mode 100644 dev-tools/xtask/src/common.rs diff --git a/dev-tools/xtask/src/clippy.rs b/dev-tools/xtask/src/clippy.rs index 7924a05574..229d0e126e 100644 --- a/dev-tools/xtask/src/clippy.rs +++ b/dev-tools/xtask/src/clippy.rs @@ -4,7 +4,8 @@ //! 
Subcommand: cargo xtask clippy -use anyhow::{bail, Context, Result}; +use crate::common::run_subcmd; +use anyhow::Result; use clap::Parser; use std::process::Command; @@ -51,25 +52,5 @@ pub fn run_cmd(args: ClippyArgs) -> Result<()> { .arg("--deny") .arg("warnings"); - eprintln!( - "running: {:?} {}", - &cargo, - command - .get_args() - .map(|arg| format!("{:?}", arg.to_str().unwrap())) - .collect::>() - .join(" ") - ); - - let exit_status = command - .spawn() - .context("failed to spawn child process")? - .wait() - .context("failed to wait for child process")?; - - if !exit_status.success() { - bail!("clippy failed: {}", exit_status); - } - - Ok(()) + run_subcmd(command) } diff --git a/dev-tools/xtask/src/common.rs b/dev-tools/xtask/src/common.rs new file mode 100644 index 0000000000..03b17a560f --- /dev/null +++ b/dev-tools/xtask/src/common.rs @@ -0,0 +1,34 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Common xtask command helpers + +use anyhow::{bail, Context, Result}; +use std::process::Command; + +/// Runs the given command, printing some basic debug information around it, and +/// failing with an error message if the command does not exit successfully +pub fn run_subcmd(mut command: Command) -> Result<()> { + eprintln!( + "running: {} {}", + command.get_program().to_str().unwrap(), + command + .get_args() + .map(|arg| format!("{:?}", arg.to_str().unwrap())) + .collect::>() + .join(" ") + ); + + let exit_status = command + .spawn() + .context("failed to spawn child process")? 
+ .wait() + .context("failed to wait for child process")?; + + if !exit_status.success() { + bail!("failed: {}", exit_status); + } + + Ok(()) +} diff --git a/dev-tools/xtask/src/live_test.rs b/dev-tools/xtask/src/live_test.rs index 05b2ab043f..d334ff91bb 100644 --- a/dev-tools/xtask/src/live_test.rs +++ b/dev-tools/xtask/src/live_test.rs @@ -4,6 +4,7 @@ //! Subcommand: cargo xtask live-test +use crate::common::run_subcmd; use anyhow::{bail, Context, Result}; use clap::Parser; use std::process::Command; @@ -153,28 +154,3 @@ pub fn run_cmd(_args: Args) -> Result<()> { Ok(()) } - -// XXX-dap commonize with clippy -fn run_subcmd(mut command: Command) -> Result<()> { - eprintln!( - "running: {} {}", - command.get_program().to_str().unwrap(), - command - .get_args() - .map(|arg| format!("{:?}", arg.to_str().unwrap())) - .collect::>() - .join(" ") - ); - - let exit_status = command - .spawn() - .context("failed to spawn child process")? - .wait() - .context("failed to wait for child process")?; - - if !exit_status.success() { - bail!("failed: {}", exit_status); - } - - Ok(()) -} diff --git a/dev-tools/xtask/src/main.rs b/dev-tools/xtask/src/main.rs index 0d61c1035c..ffd1d48b29 100644 --- a/dev-tools/xtask/src/main.rs +++ b/dev-tools/xtask/src/main.rs @@ -16,6 +16,7 @@ use std::process::Command; mod check_features; mod check_workspace_deps; mod clippy; +mod common; #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] mod external; mod live_test; From c4f9ee71280d5b9482818e5d17d906e30c2b1276 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 12:42:36 -0700 Subject: [PATCH 11/24] write some docs --- live-tests/tests/common/mod.rs | 65 +++++++++++++++++------ live-tests/tests/common/reconfigurator.rs | 22 ++++++++ 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 96cc0d4438..787161a692 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ 
-20,6 +20,8 @@ use std::net::SocketAddrV6; use std::path::Component; use std::sync::Arc; +/// Contains data and interfaces useful for running tests against an existing +/// deployed control plane pub struct LiveTestContext { logctx: LogContext, opctx: OpContext, @@ -28,6 +30,7 @@ pub struct LiveTestContext { } impl LiveTestContext { + /// Make a new `LiveTestContext` for a test called `test_name`. pub async fn new( test_name: &'static str, ) -> Result { @@ -41,14 +44,32 @@ impl LiveTestContext { Ok(LiveTestContext { logctx, opctx, resolver, datastore }) } + /// Clean up this `LiveTestContext` + /// + /// This mainly removes log files created by the test. We do this in this + /// explicit cleanup function rather than on `Drop` because we want the log + /// files preserved on test failure. pub fn cleanup_successful(self) { self.logctx.cleanup_successful(); } + /// Returns a logger suitable for use in the test pub fn log(&self) -> &slog::Logger { &self.logctx.log } + /// Returns an `OpContext` suitable for use in tests + pub fn opctx(&self) -> &OpContext { + &self.opctx + } + + /// Returns a `DataStore` pointing at this deployed system's database + pub fn datastore(&self) -> &DataStore { + &self.datastore + } + + /// Returns a client for any Nexus's internal API (based on the Nexus + /// instances that are in DNS) pub async fn any_internal_nexus_client( &self, ) -> Result { @@ -60,6 +81,7 @@ impl LiveTestContext { Ok(self.specific_internal_nexus_client(sockaddr)) } + /// Returns a client for a Nexus internal API at the given socket address pub fn specific_internal_nexus_client( &self, sockaddr: SocketAddrV6, @@ -69,6 +91,8 @@ impl LiveTestContext { nexus_client::Client::new(&url, log) } + /// Returns a list of clients for the internal APIs for all Nexus instances + /// found in DNS pub async fn all_internal_nexus_clients( &self, ) -> Result, anyhow::Error> { @@ -81,26 +105,17 @@ impl LiveTestContext { .map(|s| self.specific_internal_nexus_client(s)) .collect()) } - - 
pub fn opctx(&self) -> &OpContext { - &self.opctx - } - - pub fn datastore(&self) -> &DataStore { - &self.datastore - } } fn create_resolver(log: &slog::Logger) -> Result { - // In principle, we should look at /etc/resolv.conf to find the - // DNS servers. In practice, this usually isn't populated - // today. See oxidecomputer/omicron#2122. + // In principle, we should look at /etc/resolv.conf to find the DNS servers. + // In practice, this usually isn't populated today. See + // oxidecomputer/omicron#2122. // - // However, the address selected below should work for most - // existing Omicron deployments today. That's because while the - // base subnet is in principle configurable in config-rss.toml, - // it's very uncommon to change it from the default value used - // here. + // However, the address selected below should work for most existing Omicron + // deployments today. That's because while the base subnet is in principle + // configurable in config-rss.toml, it's very uncommon to change it from the + // default value used here. let subnet = Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); eprintln!("note: using DNS server for subnet {}", subnet.net()); internal_dns::resolver::Resolver::new_from_subnet(log.clone(), subnet) @@ -109,6 +124,7 @@ fn create_resolver(log: &slog::Logger) -> Result { }) } +/// Creates a DataStore pointing at the CockroachDB cluster that's in DNS async fn create_datastore( log: &slog::Logger, resolver: &Resolver, @@ -137,6 +153,11 @@ async fn create_datastore( .map(Arc::new) } +/// Performs quick checks to determine if the user is running these tests in the +/// wrong place and bails out if so +/// +/// This isn't perfect but seeks to fail fast in obviously bogus environments +/// that someone might accidentally try to run this in. 
async fn check_execution_environment( resolver: &Resolver, ) -> Result<(), anyhow::Error> { @@ -145,6 +166,9 @@ async fn check_execution_environment( "live tests can only be run on deployed systems, which run illumos" ); + // The only real requirement for these tests is that they're run from a + // place with connectivity to the underlay network of a deployed control + // plane. The easiest way to tell is to look up something in internal DNS. resolver.lookup_ip(ServiceName::InternalDns).await.map_err(|e| { let text = format!( "check_execution_environment(): failed to look up internal DNS \ @@ -185,6 +209,15 @@ async fn check_execution_environment( Ok(()) } +/// Performs additional checks to determine if we're running in an environment +/// that we believe is safe to run tests +/// +/// These tests may make arbitrary modifications to the system. We don't want +/// to run this in dogfood or other pre-production or production environments. +/// This function uses an allowlist of Oxide serials corresponding to test +/// environments so that it never accidentally runs on a production system. +/// +/// Non-Oxide hardware (e.g., PCs, a4x2, etc.) are always allowed. 
async fn check_hardware_environment( opctx: &OpContext, datastore: &DataStore, diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs index 7d8ceca46f..8f2560bb49 100644 --- a/live-tests/tests/common/reconfigurator.rs +++ b/live-tests/tests/common/reconfigurator.rs @@ -10,6 +10,28 @@ use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_types::deployment::{Blueprint, PlanningInput}; use slog::{debug, info}; +/// Modify the system by editing the current target blueprint +/// +/// More precisely, this function: +/// +/// - fetches the current target blueprint +/// - creates a new BlueprintBuilder based on it +/// - invokes the caller's `edit_fn`, which may modify the builder however it +/// likes +/// - generates a new blueprint (thus based on the current target) +/// - uploads the new blueprint +/// - sets the new blueprint as the current target +/// - enables the new blueprint +/// +/// ## Errors +/// +/// This function fails if the current target blueprint is not already enabled. +/// That's because a disabled target blueprint means somebody doesn't want +/// Reconfigurator running or doesn't want it using that blueprint. We don't +/// want the test to inadvertently override that behavior. In a typical use +/// case, a developer enables the initial target blueprint before running these +/// tests and then doesn't need to think about it again for the lifetime of +/// their test environment. 
pub async fn blueprint_edit_current_target( log: &slog::Logger, planning_input: &PlanningInput, From 826b278c68a96d1f059d3a8087b081aadfdb3fd5 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 12:56:11 -0700 Subject: [PATCH 12/24] make it a macro --- Cargo.lock | 10 +++ Cargo.toml | 5 +- live-tests/Cargo.toml | 1 + live-tests/macros/Cargo.toml | 16 +++++ live-tests/macros/src/lib.rs | 80 +++++++++++++++++++++++ live-tests/macros/src/main.rs | 3 + live-tests/tests/test_nexus_add_remove.rs | 9 +-- 7 files changed, 117 insertions(+), 7 deletions(-) create mode 100644 live-tests/macros/Cargo.toml create mode 100644 live-tests/macros/src/lib.rs create mode 100644 live-tests/macros/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 84cfa67213..4ec7c77600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4535,6 +4535,15 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +[[package]] +name = "live-tests-macros" +version = "0.1.0" +dependencies = [ + "omicron-workspace-hack", + "quote", + "syn 2.0.74", +] + [[package]] name = "lock_api" version = "0.4.12" @@ -6006,6 +6015,7 @@ dependencies = [ "dropshot", "futures", "internal-dns", + "live-tests-macros", "nexus-client", "nexus-config", "nexus-db-model", diff --git a/Cargo.toml b/Cargo.toml index 9f9e7aa5f4..817f4b7326 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ members = [ "ipcc", "key-manager", "live-tests", + "live-tests/macros", "nexus", "nexus-config", "nexus-sled-agent-shared", @@ -170,7 +171,8 @@ default-members = [ "ipcc", "key-manager", # Do not include live-tests in the list of default members because its tests - # only work in a deployed system. + # only work in a deployed system. The macros can be here, though. 
+ "live-tests/macros", "nexus", "nexus-config", "nexus-sled-agent-shared", @@ -405,6 +407,7 @@ libc = "0.2.156" libfalcon = { git = "https://github.com/oxidecomputer/falcon", rev = "e69694a1f7cc9fe31fab27f321017280531fb5f7" } libnvme = { git = "https://github.com/oxidecomputer/libnvme", rev = "dd5bb221d327a1bc9287961718c3c10d6bd37da0" } linear-map = "1.2.0" +live-tests-macros = { path = "live-tests/macros" } macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" mockall = "0.13" diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml index 2a27e8f848..a68a492703 100644 --- a/live-tests/Cargo.toml +++ b/live-tests/Cargo.toml @@ -17,6 +17,7 @@ assert_matches.workspace = true dropshot.workspace = true futures.workspace = true internal-dns.workspace = true +live-tests-macros.workspace = true nexus-client.workspace = true nexus-config.workspace = true nexus-db-model.workspace = true diff --git a/live-tests/macros/Cargo.toml b/live-tests/macros/Cargo.toml new file mode 100644 index 0000000000..81d094d926 --- /dev/null +++ b/live-tests/macros/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "live-tests-macros" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lib] +proc-macro = true + +[lints] +workspace = true + +[dependencies] +quote.workspace = true +syn = { workspace = true, features = [ "fold", "parsing" ] } +omicron-workspace-hack.workspace = true diff --git a/live-tests/macros/src/lib.rs b/live-tests/macros/src/lib.rs new file mode 100644 index 0000000000..49abb5b6cf --- /dev/null +++ b/live-tests/macros/src/lib.rs @@ -0,0 +1,80 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Macro to wrap a live test function that automatically creates and cleans up +//! 
the `LiveTestContext` + +use proc_macro::TokenStream; +use quote::quote; +use syn::{parse_macro_input, ItemFn}; + +/// Attribute for wrapping a test function to handle automatically creating and +/// destroying a LiveTestContext. If the wrapped test fails, the context will +/// intentionally not be cleaned up to support debugging. +/// +/// Example usage: +/// +/// ```ignore +/// #[live_test] +/// async fn test_my_test_case(lc: &mut LiveTestContext) { +/// assert!(true); +/// } +/// ``` +/// +/// We use this instead of implementing Drop on LiveTestContext because we want +/// the teardown to only happen when the test doesn't fail (which causes a panic +/// and unwind). +#[proc_macro_attribute] +pub fn live_test(_attrs: TokenStream, input: TokenStream) -> TokenStream { + let input_func = parse_macro_input!(input as ItemFn); + + let mut correct_signature = true; + if input_func.sig.variadic.is_some() + || input_func.sig.inputs.len() != 1 + || input_func.sig.asyncness.is_none() + { + correct_signature = false; + } + + // Verify we're returning an empty tuple + correct_signature &= match input_func.sig.output { + syn::ReturnType::Default => true, + syn::ReturnType::Type(_, ref t) => { + if let syn::Type::Tuple(syn::TypeTuple { elems, .. }) = &**t { + elems.is_empty() + } else { + false + } + } + }; + if !correct_signature { + panic!("func signature must be async fn(&LiveTestContext)"); + } + + let func_ident_string = input_func.sig.ident.to_string(); + let func_ident = input_func.sig.ident.clone(); + let new_block = quote! 
{ + { + #input_func + + let ctx = crate::common::LiveTestContext::new( + #func_ident_string + ).await.expect("setting up LiveTestContext"); + #func_ident(&ctx).await; + ctx.cleanup_successful(); + } + }; + let mut sig = input_func.sig.clone(); + sig.inputs.clear(); + let func = ItemFn { + attrs: input_func.attrs, + vis: input_func.vis, + sig, + block: Box::new(syn::parse2(new_block).unwrap()), + }; + TokenStream::from(quote!( + #[::tokio::test] + #func + )) +} diff --git a/live-tests/macros/src/main.rs b/live-tests/macros/src/main.rs new file mode 100644 index 0000000000..e7a11a969c --- /dev/null +++ b/live-tests/macros/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 47615fcc65..ddb538ecae 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -22,10 +22,10 @@ use omicron_test_utils::dev::poll::CondCheckError; use slog::{debug, info, o}; use std::net::SocketAddrV6; use std::time::Duration; +use live_tests_macros::live_test; // XXX-dap clean up in general // XXX-dap clean up logging -// XXX-dap make a macro for LiveTestContext // TODO-coverage This test could check other stuff: // // - that after adding: @@ -38,10 +38,9 @@ use std::time::Duration; // - we can't reach it any more on the underlay // - it doesn't appear in external DNS any more // -#[tokio::test] -async fn test_nexus_add_remove() { +#[live_test] +async fn test_nexus_add_remove(lc: &LiveTestContext) { // Test setup - let lc = LiveTestContext::new("test_nexus_add_remove").await.unwrap(); let log = lc.log(); let mylog = lc.log().new(o!("top-level" => true)); let nexus = @@ -197,8 +196,6 @@ async fn test_nexus_add_remove() { assert_eq!(found.id, saga_id); assert!(matches!(found.state, SagaState::Succeeded)); - - lc.cleanup_successful(); } async fn list_sagas( From aed5ea257f114227c1d7e01206409e25b95b634d Mon Sep 17 00:00:00 2001 
From: David Pacheco Date: Fri, 23 Aug 2024 14:00:15 -0700 Subject: [PATCH 13/24] fix bugs, clean up --- live-tests/tests/common/mod.rs | 13 -------- live-tests/tests/test_nexus_add_remove.rs | 37 +++++++++++------------ 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 787161a692..f1f17be6ca 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -68,19 +68,6 @@ impl LiveTestContext { &self.datastore } - /// Returns a client for any Nexus's internal API (based on the Nexus - /// instances that are in DNS) - pub async fn any_internal_nexus_client( - &self, - ) -> Result { - let sockaddr = self - .resolver - .lookup_socket_v6(ServiceName::Nexus) - .await - .context("looking up Nexus in internal DNS")?; - Ok(self.specific_internal_nexus_client(sockaddr)) - } - /// Returns a client for a Nexus internal API at the given socket address pub fn specific_internal_nexus_client( &self, diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index ddb538ecae..e7cd13a634 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -9,6 +9,7 @@ use assert_matches::assert_matches; use common::reconfigurator::blueprint_edit_current_target; use common::LiveTestContext; use futures::TryStreamExt; +use live_tests_macros::live_test; use nexus_client::types::Saga; use nexus_client::types::SagaState; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; @@ -19,10 +20,9 @@ use nexus_types::deployment::SledFilter; use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_test_utils::dev::poll::CondCheckError; -use slog::{debug, info, o}; +use slog::{debug, info}; use std::net::SocketAddrV6; use std::time::Duration; -use live_tests_macros::live_test; // XXX-dap clean up in general // XXX-dap clean up logging @@ -42,14 
+42,14 @@ use live_tests_macros::live_test; async fn test_nexus_add_remove(lc: &LiveTestContext) { // Test setup let log = lc.log(); - let mylog = lc.log().new(o!("top-level" => true)); - let nexus = - lc.any_internal_nexus_client().await.expect("internal Nexus client"); let opctx = lc.opctx(); let datastore = lc.datastore(); let planning_input = PlanningInputFromDb::assemble(&opctx, &datastore) .await .expect("planning input"); + let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); + let nexus = + initial_nexus_clients.iter().next().expect("internal Nexus client"); // First, deploy a new Nexus zone to an arbitrary sled. let sled_id = planning_input @@ -109,7 +109,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("new Nexus to be usable"); assert!(initial_sagas_list.is_empty()); - info!(mylog, "new Nexus is online"); + info!(log, "new Nexus is online"); // Create a demo saga from the new Nexus zone. We'll use this to test that // when the zone is expunged, its saga gets moved to a different Nexus. @@ -122,9 +122,9 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { list_sagas(&new_zone_client).await.expect("new Nexus sagas_list"); assert_eq!(sagas_list.len(), 1); assert_eq!(sagas_list[0].id, saga_id); + info!(log, "created demo saga"; "demo_saga" => ?demo_saga); // Now expunge the zone we just created. - info!(mylog, "ready to expunge"); let _ = blueprint_edit_current_target( log, &planning_input, @@ -138,24 +138,16 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); - // XXX-dap first, wait for the expunged instance to actually go away. - // Otherwise, it's possible we find that one in - // `all_internal_nexus_clients()` and complete the saga before the zone gets - // expunged! - // - // XXX-dap can we check the URL on the client to make sure it's a different - // one? - // Wait for some other Nexus instance to pick up the saga. 
- let nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); let nexus_found = wait_for_condition( || async { - for nexus_client in &nexus_clients { + for nexus_client in &initial_nexus_clients { + assert!(nexus_client.baseurl() != new_zone_client.baseurl()); let Ok(sagas) = list_sagas(&nexus_client).await else { continue; }; - debug!(mylog, "found sagas (last): {:?}", sagas); + debug!(log, "found sagas (last): {:?}", sagas); if sagas.into_iter().any(|s| s.id == saga_id) { return Ok(nexus_client); } @@ -168,7 +160,12 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { ) .await .unwrap(); - info!(mylog, "found saga in a different Nexus instance"); + + info!(log, "found saga in a different Nexus instance"; + "saga_id" => %saga_id, + "found_nexus" => nexus_found.baseurl(), + ); + assert!(nexus_found.baseurl() != new_zone_client.baseurl()); // Now, complete the demo saga on whichever instance is running it now. // `saga_demo_complete` is not synchronous. It just unblocks the saga. 
@@ -180,7 +177,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { let found = wait_for_condition( || async { let sagas = list_sagas(&nexus_found).await.expect("listing sagas"); - debug!(mylog, "found sagas (last): {:?}", sagas); + debug!(log, "found sagas (last): {:?}", sagas); let found = sagas.into_iter().find(|s| s.id == saga_id).unwrap(); if matches!(found.state, SagaState::Succeeded) { Ok(found) From 9d959dbd7ce6921135cbeaefdc9ea18688c6b776 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 14:09:01 -0700 Subject: [PATCH 14/24] test that the zone goes away --- live-tests/tests/test_nexus_add_remove.rs | 34 +++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index e7cd13a634..d85f6a21cf 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -24,8 +24,6 @@ use slog::{debug, info}; use std::net::SocketAddrV6; use std::time::Duration; -// XXX-dap clean up in general -// XXX-dap clean up logging // TODO-coverage This test could check other stuff: // // - that after adding: @@ -34,9 +32,7 @@ use std::time::Duration; // (e.g., using an `oxide_client` using a custom reqwest resolver that // points only at that one IP so that we can make sure we're always getting // that one) -// - that after expungement: -// - we can't reach it any more on the underlay -// - it doesn't appear in external DNS any more +// - that after expungement, it doesn't appear in external DNS any more // #[live_test] async fn test_nexus_add_remove(lc: &LiveTestContext) { @@ -138,6 +134,34 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); + // At some point, we should be unable to reach this Nexus any more. 
+ wait_for_condition( + || async { + match new_zone_client.saga_list(None, None, None).await { + Err(nexus_client::Error::CommunicationError(error)) => { + info!(log, "expunged Nexus no longer reachable"; + "error" => slog_error_chain::InlineErrorChain::new(&error), + ); + Ok(()) + } + Ok(_) => { + debug!(log, "expunged Nexus is still reachable"); + Err(CondCheckError::<()>::NotYet) + } + Err(error) => { + debug!(log, "expunged Nexus is still reachable"; + "error" => slog_error_chain::InlineErrorChain::new(&error), + ); + Err(CondCheckError::NotYet) + } + } + }, + &Duration::from_millis(50), + &Duration::from_secs(60), + ) + .await + .unwrap(); + // Wait for some other Nexus instance to pick up the saga. let nexus_found = wait_for_condition( || async { From f0515949db72116c5dda021c9b1cbc883a4dbba6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 14:21:36 -0700 Subject: [PATCH 15/24] live-test -> live-tests --- .../jobs/{build-live-test.sh => build-live-tests.sh} | 4 ++-- dev-tools/xtask/src/{live_test.rs => live_tests.rs} | 10 +++++----- dev-tools/xtask/src/main.rs | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) rename .github/buildomat/jobs/{build-live-test.sh => build-live-tests.sh} (89%) rename dev-tools/xtask/src/{live_test.rs => live_tests.rs} (95%) diff --git a/.github/buildomat/jobs/build-live-test.sh b/.github/buildomat/jobs/build-live-tests.sh similarity index 89% rename from .github/buildomat/jobs/build-live-test.sh rename to .github/buildomat/jobs/build-live-tests.sh index df3b2712b5..c60c1048ba 100755 --- a/.github/buildomat/jobs/build-live-test.sh +++ b/.github/buildomat/jobs/build-live-tests.sh @@ -23,5 +23,5 @@ source ./env.sh banner prerequisites ptime -m bash ./tools/install_builder_prerequisites.sh -y -banner live-test -ptime -m cargo xtask live-test +banner live-tests +ptime -m cargo xtask live-tests diff --git a/dev-tools/xtask/src/live_test.rs b/dev-tools/xtask/src/live_tests.rs similarity index 95% rename from 
dev-tools/xtask/src/live_test.rs rename to dev-tools/xtask/src/live_tests.rs index d334ff91bb..35d40343ee 100644 --- a/dev-tools/xtask/src/live_test.rs +++ b/dev-tools/xtask/src/live_tests.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Subcommand: cargo xtask live-test +//! Subcommand: cargo xtask live-tests use crate::common::run_subcmd; use anyhow::{bail, Context, Result}; @@ -13,7 +13,7 @@ use std::process::Command; pub struct Args {} pub fn run_cmd(_args: Args) -> Result<()> { - const NAME: &str = "live-test-archive"; + const NAME: &str = "live-tests-archive"; // The live tests operate in deployed environments, which always run // illumos. Bail out quickly if someone tries to run this on a system whose @@ -22,7 +22,7 @@ pub fn run_cmd(_args: Args) -> Result<()> { // silently missing something you might expect to be there. Plus, you can // still check and even build *this* code on non-illumos systems.) if cfg!(not(target_os = "illumos")) { - bail!("live-test archive can only be built on illumos systems"); + bail!("live-tests archive can only be built on illumos systems"); } let tmpdir_root = @@ -71,8 +71,8 @@ pub fn run_cmd(_args: Args) -> Result<()> { // plus the nextext archive file. // // To avoid creating a tarbomb, we want all the files prefixed with - // "live-test-archive/". There's no great way to do this with the illumos - // tar(1) except to create a temporary directory called "live-test-archive" + // "live-tests-archive/". There's no great way to do this with the illumos + // tar(1) except to create a temporary directory called "live-tests-archive" // that contains the files and then tar'ing up that. // // Ironically, an easy way to construct that directory is with tar(1). 
diff --git a/dev-tools/xtask/src/main.rs b/dev-tools/xtask/src/main.rs index ffd1d48b29..9880adeb67 100644 --- a/dev-tools/xtask/src/main.rs +++ b/dev-tools/xtask/src/main.rs @@ -19,7 +19,7 @@ mod clippy; mod common; #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] mod external; -mod live_test; +mod live_tests; mod usdt; #[cfg(target_os = "illumos")] @@ -62,7 +62,7 @@ enum Cmds { Download(external::External), /// Create a bundle of live tests - LiveTest(live_test::Args), + LiveTests(live_tests::Args), /// Utilities for working with MGS. MgsDev(external::External), @@ -132,7 +132,7 @@ fn main() -> Result<()> { external.exec_bin("xtask-downloader") } } - Cmds::LiveTest(args) => live_test::run_cmd(args), + Cmds::LiveTests(args) => live_tests::run_cmd(args), Cmds::MgsDev(external) => external.exec_bin("mgs-dev"), Cmds::OmicronDev(external) => external.exec_bin("omicron-dev"), Cmds::Openapi(external) => external.exec_bin("openapi-manager"), From 6dbf1ff9cebfb416e14ec87ca512be0459a25971 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 15:02:36 -0700 Subject: [PATCH 16/24] add some docs --- README.adoc | 2 + end-to-end-tests/README.adoc | 2 + live-tests/README.adoc | 78 ++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 live-tests/README.adoc diff --git a/README.adoc b/README.adoc index d48a5c9736..c2473a7a07 100644 --- a/README.adoc +++ b/README.adoc @@ -62,6 +62,8 @@ Nextest https://github.com/nextest-rs/nextest/issues/16[does not support doctest Similarly, you can run tests inside a https://github.com/oxidecomputer/falcon[Falcon] based VM. This is described in the `test-utils` https://github.com/oxidecomputer/omicron/tree/main/test-utils[README]. +There's also a xref:./live-tests/README.adoc[`live-tests`] test suite that can be run by hand in a _deployed_ Omicron system. + === rustfmt and clippy You can **format the code** using `cargo fmt`. Make sure to run this before pushing changes. 
The CI checks that the code is correctly formatted. diff --git a/end-to-end-tests/README.adoc b/end-to-end-tests/README.adoc index b9766db809..3e31f2b382 100644 --- a/end-to-end-tests/README.adoc +++ b/end-to-end-tests/README.adoc @@ -4,6 +4,8 @@ These tests run in Buildomat. They are built by the xref:../.github/buildomat/jo This package is not built or run by default (it is excluded from `default-members` in xref:../Cargo.toml[]). +See also: xref:../live-tests/README.adoc[omicron-live-tests]. + == Running these tests on your machine 1. xref:../docs/how-to-run.adoc[Make yourself a Gimlet]. diff --git a/live-tests/README.adoc b/live-tests/README.adoc new file mode 100644 index 0000000000..9ea3a93e48 --- /dev/null +++ b/live-tests/README.adoc @@ -0,0 +1,78 @@ += Omicron live tests + +The `omicron-live-tests` package contains automated tests that operate in the context of an already-deployed "real" Oxide system (e.g., `a4x2` or our `london` or `madrid` test environments). This is a home for automated tests for all kinds of Reconfigurator behavior (e.g., add/expunge of all zones, add/expunge sled, upgrades, etc.). It can probably be used for non-Reconfigurator behavior, too. + +This package is not built or tested by default because the tests generally can't work in a dev environment and there's no way to have `cargo` build and check them but not run the tests by default. + +== Why a separate test suite? + +What makes these tests different from the rest of the test suite is that they require connectivity to the underlay network of the deployed system and they make API calls to various components in that system and they assume that this will behave like a real production system. 
By contrast, the normal tests instead _set up_ a bunch of components using simulated sled agents and localhost networking, which is great for starting from a predictable state and running tests in parallel, but the simulated sled agents and networking make it impossible to exercise quite a lot of Reconfigurator's functionality. + +There are also the `end-to-end-tests`. That environment is more realistic than the main test suite, but not faithful enough for many Reconfigurator tests. + +== Production systems + +There are some safeguards so that these tests won't run on production systems: they refuse to run if they find any Oxide-hardware sleds in the system whose serial numbers don't correspond to known test environments. + +== Usage + +These tests are not currently run automatically (though they are _built_ in CI). + +You can run them yourself. First, deploy Omicron using `a4x2` or one of the hardware test rigs. In your Omicron workspace, run `cargo xtask live-tests` to build an archive and then follow the instructions: + +``` +$ cargo xtask live-tests + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.96s + Running `target/debug/xtask live-tests` +using temporary directory: /dangerzone/omicron_tmp/.tmp0ItZUD +will create archive file: /dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst +output tarball: /home/dap/omicron-work/target/live-tests-archive.tgz + +running: /home/dap/.rustup/toolchains/1.80.1-x86_64-unknown-illumos/bin/cargo "nextest" "archive" "--package" "omicron-live-tests" "--archive-file" "/dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst" + Finished `test` profile [unoptimized + debuginfo] target(s) in 0.89s +info: experimental features enabled: setup-scripts + Archiving 1 binary, 1 build script output directory, and 1 linked path to /dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst + Archived 35 files to 
/dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst in 0.31s +running: bash "-c" "tar cf - Cargo.toml .config/nextest.toml live-tests | tar xf - -C \"/dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive\"" +running: tar "cf" "/home/dap/omicron-work/target/live-tests-archive.tgz" "-C" "/dangerzone/omicron_tmp/.tmp0ItZUD" "live-tests-archive" +created: /home/dap/omicron-work/target/live-tests-archive.tgz + +To use this: + +1. Copy the tarball to the switch zone in a deployed Omicron system. + + e.g., scp \ + /home/dap/omicron-work/target/live-tests-archive.tgz \ + root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root + +2. Copy the `cargo-nextest` binary to the same place. + + e.g., scp \ + $(which cargo-nextest) \ + root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root + +3. On that system, unpack the tarball with: + + tar xzf live-tests-archive.tgz + +4. On that system, run tests with: + + TMPDIR=/var/tmp ./cargo-nextest nextest run \ + --archive-file live-tests-archive/omicron-live-tests.tar.zst \ + --workspace-remap live-tests-archive +``` + +Follow the instructions, run the tests, and you'll see the usual `nextest`-style output: + +``` +root@oxz_switch:~# TMPDIR=/var/tmp ./cargo-nextest nextest run --archive-file live-tests-archive/omicron-live-tests.tar.zst --workspace-remap live-tests-archive + Extracting 1 binary, 1 build script output directory, and 1 linked path to /var/tmp/nextest-archive-Lqx9VZ + Extracted 35 files to /var/tmp/nextest-archive-Lqx9VZ in 1.01s +info: experimental features enabled: setup-scripts + Starting 1 test across 1 binary (run ID: a5fc9163-9dd5-4b23-b89f-55f8f39ebbbc, nextest profile: default) + SLOW [> 60.000s] omicron-live-tests::test_nexus_add_remove test_nexus_add_remove + PASS [ 61.975s] omicron-live-tests::test_nexus_add_remove test_nexus_add_remove +------------ + Summary [ 61.983s] 1 test run: 1 passed (1 slow), 0 skipped +root@oxz_switch:~# +``` From cf8fb0e2f0cc6a7e3a25990e3ebc40d178c1e119 Mon Sep 
17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 15:12:11 -0700 Subject: [PATCH 17/24] fix clippy --- live-tests/macros/src/main.rs | 3 --- live-tests/tests/test_nexus_add_remove.rs | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 live-tests/macros/src/main.rs diff --git a/live-tests/macros/src/main.rs b/live-tests/macros/src/main.rs deleted file mode 100644 index e7a11a969c..0000000000 --- a/live-tests/macros/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index d85f6a21cf..70e55b704a 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -44,8 +44,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("planning input"); let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); - let nexus = - initial_nexus_clients.iter().next().expect("internal Nexus client"); + let nexus = initial_nexus_clients.first().expect("internal Nexus client"); // First, deploy a new Nexus zone to an arbitrary sled. let sled_id = planning_input From a9d0a3205132864f0af1755ea79073e684054bcc Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 15:16:16 -0700 Subject: [PATCH 18/24] fix workspace dep check --- dev-tools/xtask/src/check_workspace_deps.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/dev-tools/xtask/src/check_workspace_deps.rs b/dev-tools/xtask/src/check_workspace_deps.rs index 73d5643ffb..92e83d8724 100644 --- a/dev-tools/xtask/src/check_workspace_deps.rs +++ b/dev-tools/xtask/src/check_workspace_deps.rs @@ -128,6 +128,7 @@ pub fn run_cmd() -> Result<()> { // The tests here should not be run by default, as they require // a running control plane. 
"end-to-end-tests", + "omicron-live-tests", ] .contains(&package.name.as_str()) .then_some(&package.id) From 80922f6ad45bb2fc63a6f06ae2b7ea23ff0044b1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 15:19:16 -0700 Subject: [PATCH 19/24] fix hakari --- Cargo.lock | 1 + live-tests/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 4ec7c77600..9f20a94b85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6027,6 +6027,7 @@ dependencies = [ "omicron-common", "omicron-rpaths", "omicron-test-utils", + "omicron-workspace-hack", "pq-sys", "reqwest", "serde", diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml index a68a492703..e0eaf2c338 100644 --- a/live-tests/Cargo.toml +++ b/live-tests/Cargo.toml @@ -10,6 +10,7 @@ omicron-rpaths.workspace = true [dependencies] # See omicron-rpaths for more about the "pq-sys" dependency. pq-sys = "*" +omicron-workspace-hack.workspace = true [dev-dependencies] anyhow.workspace = true From 9288eb0ad4998a82a8e1f8dabfbb5b1483c7c4a2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 15:46:14 -0700 Subject: [PATCH 20/24] fix live-tests check (move this into the existing check that already has nextest) --- .github/buildomat/build-and-test.sh | 3 +++ .github/buildomat/jobs/build-live-tests.sh | 27 ---------------------- 2 files changed, 3 insertions(+), 27 deletions(-) delete mode 100755 .github/buildomat/jobs/build-live-tests.sh diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh index 1e4b655cb9..135ae7d3ea 100755 --- a/.github/buildomat/build-and-test.sh +++ b/.github/buildomat/build-and-test.sh @@ -89,6 +89,9 @@ ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose banner doctest ptime -m timeout 1h cargo test --doc --locked --verbose --no-fail-fast +# Build the live-tests. (We can't actually run them here. See the README.) 
+ptime -m cargo xtask live-tests + # We expect the seed CRDB to be placed here, so we explicitly remove it so the # rmdir check below doesn't get triggered. Nextest doesn't have support for # teardown scripts so this is the best we've got. diff --git a/.github/buildomat/jobs/build-live-tests.sh b/.github/buildomat/jobs/build-live-tests.sh deleted file mode 100755 index c60c1048ba..0000000000 --- a/.github/buildomat/jobs/build-live-tests.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -#: -#: name = "build-live-tests" -#: variety = "basic" -#: target = "helios-2.0" -#: rust_toolchain = true -#: output_rules = [] - -# Builds the "live-tests" nextest archive - -set -o errexit -set -o pipefail -set -o xtrace - -cargo --version -rustc --version - -# -# Set up our PATH for use with this workspace. -# -source ./env.sh - -banner prerequisites -ptime -m bash ./tools/install_builder_prerequisites.sh -y - -banner live-tests -ptime -m cargo xtask live-tests From 9bbb2730af59c78bce2494e1a875891e5677f187 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 23 Aug 2024 19:45:20 -0700 Subject: [PATCH 21/24] cannot build that on non-illumos --- .github/buildomat/build-and-test.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh index 135ae7d3ea..50bbd8194d 100755 --- a/.github/buildomat/build-and-test.sh +++ b/.github/buildomat/build-and-test.sh @@ -89,8 +89,11 @@ ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose banner doctest ptime -m timeout 1h cargo test --doc --locked --verbose --no-fail-fast -# Build the live-tests. (We can't actually run them here. See the README.) -ptime -m cargo xtask live-tests +# Build the live-tests. This is only supported on illumos. +# We also can't actually run them here. See the README for more details. 
+if [[ $target_os == "illumos" ]]; then + ptime -m cargo xtask live-tests +fi # We expect the seed CRDB to be placed here, so we explicitly remove it so the # rmdir check below doesn't get triggered. Nextest doesn't have support for From c7cd0ed639545de63377630eda53e0e2111245a6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 26 Aug 2024 14:03:27 -0700 Subject: [PATCH 22/24] review feedback --- .config/nextest.toml | 2 ++ live-tests/macros/src/lib.rs | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index a294bcc18b..67cabe9f0e 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -10,6 +10,8 @@ experimental = ["setup-scripts"] [[profile.default.scripts]] # Exclude omicron-dev tests from crdb-seed as we explicitly want to simulate an # environment where the seed file doesn't exist. +# Exclude omicron-live-tests because those don't need this and also don't have +# it available in the environment in which they run. filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' diff --git a/live-tests/macros/src/lib.rs b/live-tests/macros/src/lib.rs index 49abb5b6cf..4fdd4029b5 100644 --- a/live-tests/macros/src/lib.rs +++ b/live-tests/macros/src/lib.rs @@ -9,15 +9,21 @@ use proc_macro::TokenStream; use quote::quote; use syn::{parse_macro_input, ItemFn}; -/// Attribute for wrapping a test function to handle automatically creating and -/// destroying a LiveTestContext. If the wrapped test fails, the context will -/// intentionally not be cleaned up to support debugging. +/// Define a test function that uses `LiveTestContext` +/// +/// This is usable only within the `omicron-live-tests` crate. +/// +/// Similar to `nexus_test`, this macro lets you define a test function that +/// behaves like `tokio::test` except that it accepts an argument of type +/// `&LiveTestContext`. 
The `LiveTestContext` is cleaned up on _successful_ +/// return of the test function. On failure, debugging information is +/// deliberately left around. /// /// Example usage: /// /// ```ignore /// #[live_test] -/// async fn test_my_test_case(lc: &mut LiveTestContext) { +/// async fn test_my_test_case(lc: &LiveTestContext) { /// assert!(true); /// } /// ``` From ab5448b57d9266ac247f46c69bbcdc29ca80ae1c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 27 Aug 2024 15:38:30 -0700 Subject: [PATCH 23/24] fix semantic mismerge --- live-tests/tests/common/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index f1f17be6ca..7091232397 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -133,7 +133,7 @@ async fn create_datastore( .context("failed to parse constructed postgres URL")?; let db_config = nexus_db_queries::db::Config { url }; - let pool = Arc::new(nexus_db_queries::db::Pool::new(log, &db_config)); + let pool = Arc::new(nexus_db_queries::db::Pool::new_single_host(log, &db_config)); DataStore::new_failfast(log, pool) .await .context("creating DataStore") From 665bcc17746e2c722c565bbc0b888c95e7649786 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 27 Aug 2024 15:43:14 -0700 Subject: [PATCH 24/24] rustfmt --- live-tests/tests/common/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 7091232397..28f677f5ed 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -133,7 +133,8 @@ async fn create_datastore( .context("failed to parse constructed postgres URL")?; let db_config = nexus_db_queries::db::Config { url }; - let pool = Arc::new(nexus_db_queries::db::Pool::new_single_host(log, &db_config)); + let pool = + Arc::new(nexus_db_queries::db::Pool::new_single_host(log, &db_config)); DataStore::new_failfast(log, pool) 
.await .context("creating DataStore")