From 2eef2e2297ae89cf1317dc465db7183631683a4b Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 21 May 2026 21:03:37 +1200 Subject: [PATCH 1/3] [update] add missing sleds to contact_support checks --- nexus/src/app/update.rs | 366 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 350 insertions(+), 16 deletions(-) diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index 5535c33a34e..6f633a1c5f4 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -103,6 +103,7 @@ struct UpdateContactSupportChecksInput { blueprint: Arc, // None when no target release has ever been set on the system. current_target_version: Option, + internal_update_status: internal_views::UpdateStatus, } impl UpdateContactSupportChecksInput { @@ -120,6 +121,19 @@ impl UpdateContactSupportChecksInput { } }; + let missing_sleds: BTreeSet = self + .internal_update_status + .sleds + .iter() + .filter(|sled| { + **sled + == internal_views::SledAgentUpdateStatus::unknown( + sled.sled_id, + ) + }) + .map(|sled| sled.sled_id) + .collect(); + let (stuck_sagas, stuck_sagas_error_message) = match &self.stuck_sagas { Ok(sagas) => ( sagas @@ -162,6 +176,7 @@ impl UpdateContactSupportChecksInput { stale_inventory_last_collection_time_done, unhealthy_zpools_by_sled, enabled_smf_services_not_online_by_sled, + missing_sleds, } } } @@ -192,6 +207,7 @@ struct UpdateStatusProblems { /// Enabled SMF services that are not in an `online` state. enabled_smf_services_not_online_by_sled: BTreeMap, + missing_sleds: BTreeSet, } impl UpdateStatusProblems { @@ -203,6 +219,7 @@ impl UpdateStatusProblems { stale_inventory_last_collection_time_done, unhealthy_zpools_by_sled, enabled_smf_services_not_online_by_sled, + missing_sleds, } = self; stuck_sagas.is_empty() && stuck_sagas_error_message.is_none() @@ -210,6 +227,7 @@ impl UpdateStatusProblems { && stale_inventory_last_collection_time_done.is_none() && unhealthy_zpools_by_sled.is_empty() && enabled_smf_services_not_online_by_sled.is_empty() + && missing_sleds.is_empty() } } @@ -228,6 +246,7 @@ impl KV for UpdateStatusProblems { stale_inventory_last_collection_time_done, unhealthy_zpools_by_sled, enabled_smf_services_not_online_by_sled, + missing_sleds, } = self; if !stuck_sagas.is_empty() { @@ -270,6 +289,12 @@ impl KV for UpdateStatusProblems { &format_args!("{:?}", enabled_smf_services_not_online_by_sled), )?; } + if !missing_sleds.is_empty() { + serializer.emit_arguments( + "missing_sleds".into(), + &format_args!("{:?}", missing_sleds), + )?; + } Ok(()) } } @@ -507,8 +532,8 @@ impl super::Nexus { return Err(Error::internal_error("No inventory collection found")); }; - let components_by_release_version = self - .component_version_counts( + let internal_status = self + .get_internal_update_status( opctx, &db_target_release, current_tuf_repo, @@ -516,6 +541,17 @@ impl super::Nexus { ) .await?; + let components_by_release_version = self + .component_version_counts( + // opctx, + // &db_target_release, + // current_tuf_repo, + // &inventory, + // TODO-K: Is this clone fine? Should I just use a reference? + internal_status.clone(), + ) + .await?; + let blueprint_target = self .update_status .latest_blueprint @@ -543,6 +579,7 @@ impl super::Nexus { inventory, Arc::clone(&blueprint_target.blueprint), target_release.as_ref().map(|t| &t.version), + internal_status, ) .await?; @@ -574,6 +611,7 @@ impl super::Nexus { inventory: Arc, blueprint: Arc, current_target_version: Option<&Version>, + internal_update_status: internal_views::UpdateStatus, ) -> Result { // If an update is in progress but not stuck, the remaining checks // could fail mid-update and shouldn't trigger a contact-support @@ -591,6 +629,8 @@ impl super::Nexus { UpdateActivityState::Idle | UpdateActivityState::Stuck => {} }; + // TODO-K: Add missing sleds here + let stuck_sagas = self .datastore() .saga_list_running_or_unwinding_older_than( @@ -604,6 +644,7 @@ impl super::Nexus { stuck_sagas, blueprint, current_target_version: current_target_version.cloned(), + internal_update_status, }; let problems = checks.problems(); @@ -620,15 +661,13 @@ impl super::Nexus { Ok(contact_support) } - /// Build a map of version strings to the number of components on that - /// version - async fn component_version_counts( + async fn get_internal_update_status( &self, opctx: &OpContext, target_release: &nexus_db_model::TargetRelease, current_tuf_repo: Option, inventory: &Arc, - ) -> Result, Error> { + ) -> Result { // Build current TargetReleaseDescription, defaulting to Initial if // there is no tuf repo ID which, based on DB constraints, happens if // and only if target_release_source is 'unspecified', which should only @@ -709,6 +748,99 @@ impl super::Nexus { &inventory, ); + Ok(status) + } + + /// Build a map of version strings to the number of components on that + /// version + async fn component_version_counts( + &self, + // opctx: &OpContext, + // target_release: &nexus_db_model::TargetRelease, + // current_tuf_repo: Option, + // inventory: &Arc, + status: internal_views::UpdateStatus, + ) -> Result, Error> { + // // Build current TargetReleaseDescription, defaulting to Initial if + // // there is no tuf repo ID which, based on DB constraints, happens if + // // and only if target_release_source is 'unspecified', which should only + // // happen in the initial state before any target release has been set + // let curr_target_desc = match current_tuf_repo { + // Some(repo) => { + // TargetReleaseDescription::TufRepo(repo.into_external()) + // } + // None => TargetReleaseDescription::Initial, + // }; + // + // // Get previous target release (if it exists). Build the "prev" + // // TargetReleaseDescription from the previous generation if available, + // // otherwise fall back to Initial. + // let prev_repo_id = + // if let Some(prev_gen) = target_release.generation.prev() { + // self.datastore() + // .target_release_get_generation(opctx, Generation(prev_gen)) + // .await + // .internal_context("fetching previous target release")? + // .and_then(|r| r.tuf_repo_id) + // } else { + // None + // }; + // + // // It should never happen that a target release other than the initial + // // one with target_release_source unspecified should be missing a + // // tuf_repo_id. So if we have a tuf_repo_id for the previous target + // // release, we should always have one for the current target. + // if prev_repo_id.is_some() && target_release.tuf_repo_id.is_none() { + // return Err(Error::internal_error( + // "Target release has no tuf repo but previous release has one", + // )); + // } + // + // let prev_target_desc = match prev_repo_id { + // Some(id) => TargetReleaseDescription::TufRepo( + // self.datastore() + // .tuf_repo_get_by_id(opctx, id.into()) + // .await? + // .into_external(), + // ), + // None => TargetReleaseDescription::Initial, + // }; + // + // // Get the list of sleds that should be reported as a part of the update + // // status. (In particular, this allows us to filter out sleds that are + // // physically present but not part of the cluster, as well as add + // // "unknown" counts for sleds that ought to be present but aren't.) + // let expected_sleds = self + // .datastore() + // .sled_list_all_batched( + // opctx, + // SledFilter::SpsUpdatedByReconfigurator, + // ) + // .await? + // .iter() + // .map(|sled| { + // ( + // BaseboardId { + // part_number: sled.part_number().to_string(), + // serial_number: sled.serial_number().to_string(), + // }, + // sled.id(), + // ) + // }) + // .collect(); + // + // // It's weird to use the internal view this way. It would feel more + // // correct to extract shared logic and call it in both places. On the + // // other hand, that sharing would be boilerplatey and not add much yet. + // // So for now, use the internal view, but plan to extract shared logic + // // or do our own thing here once things settle. + // let status = internal_views::UpdateStatus::new( + // &prev_target_desc, + // &curr_target_desc, + // &expected_sleds, + // &inventory, + // ); + let sled_versions = status.sleds.into_iter().flat_map(|sled| { let zone_versions = sled.zones.into_iter().map(|zone| zone.version); @@ -1033,6 +1165,58 @@ mod test { problems } + // Build an empty `internal_views::UpdateStatus` for tests that don't care + // about the missing-sleds check. + fn empty_internal_update_status() -> internal_views::UpdateStatus { + internal_views::UpdateStatus { + mgs_driven: iddqd::IdOrdMap::new(), + sleds: iddqd::IdOrdMap::new(), + } + } + + // Build a `SledAgentUpdateStatus` that looks "healthy" (i.e. has a known + // boot disk and known slot versions). + fn healthy_sled_agent_update_status( + sled_id: SledUuid, + ) -> internal_views::SledAgentUpdateStatus { + internal_views::SledAgentUpdateStatus { + sled_id, + zones: iddqd::IdOrdMap::new(), + host_phase_2: internal_views::HostPhase2Status { + boot_disk: Ok(omicron_common::disk::M2Slot::A), + slot_a_version: internal_views::TufRepoVersion::Version( + fake_target_version(), + ), + slot_b_version: internal_views::TufRepoVersion::Version( + fake_target_version(), + ), + }, + } + } + + // Build an `internal_views::UpdateStatus` whose `sleds` contains an + // "unknown" entry for each provided missing sled id, plus a healthy entry + // for each provided healthy sled id. + fn internal_update_status_with_missing_sleds( + missing_sled_ids: impl IntoIterator, + healthy_sled_ids: impl IntoIterator, + ) -> internal_views::UpdateStatus { + let mut sleds: iddqd::IdOrdMap = + missing_sled_ids + .into_iter() + .map(internal_views::SledAgentUpdateStatus::unknown) + .collect(); + for sled_id in healthy_sled_ids { + sleds + .insert_unique(healthy_sled_agent_update_status(sled_id)) + .expect("sled ids are unique"); + } + internal_views::UpdateStatus { + mgs_driven: iddqd::IdOrdMap::new(), + sleds, + } + } + #[nexus_test(server = crate::Server)] async fn test_contact_support_healthy_system( cptestctx: &ControlPlaneTestContext, @@ -1064,7 +1248,13 @@ mod test { // should be false assert!( !nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1098,7 +1288,13 @@ mod test { // should be true assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1132,7 +1328,13 @@ mod test { // support should be true assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1166,7 +1368,13 @@ mod test { fake_blueprint(&cptestctx.logctx.log, &version, Utc::now(), false); assert!( nexus - .contact_support(&opctx, inventory, blueprint, None,) + .contact_support( + &opctx, + inventory, + blueprint, + None, + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1201,7 +1409,13 @@ mod test { // support should be true assert!( nexus - .contact_support(&opctx, inventory, blueprint, None,) + .contact_support( + &opctx, + inventory, + blueprint, + None, + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1240,7 +1454,13 @@ mod test { // update is considered stuck and contact support is true. assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1285,11 +1505,21 @@ mod test { true, ); // Every health check is unhealthy: stuck saga, stuck update, stale - // inventory, unhealthy zpools, and unhealthy SMF services. Contact - // support should be true. + // inventory, unhealthy zpools, and unhealthy SMF services, plus a + // missing sled. Contact support should be true. + let missing_sled_id = SledUuid::new_v4(); assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + internal_update_status_with_missing_sleds( + [missing_sled_id], + [sled_id()], + ), + ) .await .unwrap() ); @@ -1324,7 +1554,57 @@ mod test { fake_blueprint(&cptestctx.logctx.log, &version, Utc::now(), true); assert!( !nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) + .await + .unwrap() + ); + } + + #[nexus_test(server = crate::Server)] + async fn test_contact_support_missing_sleds( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let opctx = fake_opctx(cptestctx); + insert_fake_collection( + cptestctx, + &opctx, + healthy_zpools(), + healthy_services(), + ) + .await; + let inventory = Arc::new( + nexus + .datastore() + .inventory_get_latest_collection(&opctx) + .await + .unwrap() + .unwrap(), + ); + let version = fake_target_version(); + let blueprint = + fake_blueprint(&cptestctx.logctx.log, &version, Utc::now(), false); + // The system is otherwise healthy, but a sled we expected to see in + // inventory is missing. Contact support should be true. + let missing_sled_id = SledUuid::new_v4(); + assert!( + nexus + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + internal_update_status_with_missing_sleds( + [missing_sled_id], + [sled_id()], + ), + ) .await .unwrap() ); @@ -1378,6 +1658,7 @@ mod test { inventory, blueprint, Some(&target_release), + empty_internal_update_status(), ) .await .unwrap() @@ -1403,6 +1684,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; assert_eq!(checks.problems(), UpdateStatusProblems::default()); @@ -1431,6 +1713,7 @@ mod test { stuck_sagas: Ok(vec![saga]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1463,6 +1746,7 @@ mod test { stuck_sagas: Err(err), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1495,6 +1779,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1529,6 +1814,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; assert_eq!(checks.problems(), UpdateStatusProblems::default()); @@ -1558,6 +1844,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1594,6 +1881,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected_zpool = Zpool { @@ -1636,6 +1924,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1677,6 +1966,7 @@ mod test { stuck_sagas: Ok(vec![saga]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected_zpool = Zpool { @@ -1728,11 +2018,16 @@ mod test { Utc::now() - STALE_INVENTORY_THRESHOLD - TimeDelta::seconds(10); collection.time_done = collection_time_done; + let missing_sled_id = SledUuid::new_v4(); let checks = UpdateContactSupportChecksInput { inventory: Arc::new(collection), stuck_sagas: Ok(vec![saga]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: internal_update_status_with_missing_sleds( + [missing_sled_id], + [sled_id], + ), }; let expected_zpool = Zpool { @@ -1757,9 +2052,48 @@ mod test { enabled_smf_services_not_online_by_sled: BTreeMap::from([( sled_id, services, )]), + missing_sleds: BTreeSet::from([missing_sled_id]), }; assert_eq!(normalise_zpool_times(checks.problems()), expected); logctx.cleanup_successful(); } + + #[test] + fn test_problems_missing_sleds() { + let logctx = test_setup_log("test_problems_missing_sleds"); + let blueprint = fake_blueprint( + &logctx.log, + &fake_target_version(), + Utc::now(), + false, + ); + + // Two sleds expected: one missing from inventory, one present and + // healthy. Only the missing sled should be picked up. + let missing_sled_id = SledUuid::new_v4(); + let healthy_sled_id = SledUuid::new_v4(); + let checks = UpdateContactSupportChecksInput { + inventory: Arc::new(fake_collection_with_ids( + healthy_sled_id, + healthy_zpools(), + healthy_services(), + )), + stuck_sagas: Ok(vec![]), + blueprint, + current_target_version: Some(fake_target_version()), + internal_update_status: internal_update_status_with_missing_sleds( + [missing_sled_id], + [healthy_sled_id], + ), + }; + + let expected = UpdateStatusProblems { + missing_sleds: BTreeSet::from([missing_sled_id]), + ..Default::default() + }; + assert_eq!(checks.problems(), expected); + + logctx.cleanup_successful(); + } } From a3874ea61c371b1d9b186896add2f2b442421757 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 21 May 2026 21:13:22 +1200 Subject: [PATCH 2/3] clean up --- nexus/src/app/update.rs | 98 +---------------------------------------- 1 file changed, 2 insertions(+), 96 deletions(-) diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index 6f633a1c5f4..a84da985c8f 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -541,16 +541,8 @@ impl super::Nexus { ) .await?; - let components_by_release_version = self - .component_version_counts( - // opctx, - // &db_target_release, - // current_tuf_repo, - // &inventory, - // TODO-K: Is this clone fine? Should I just use a reference? - internal_status.clone(), - ) - .await?; + let components_by_release_version = + self.component_version_counts(internal_status.clone()).await?; let blueprint_target = self .update_status @@ -629,8 +621,6 @@ impl super::Nexus { UpdateActivityState::Idle | UpdateActivityState::Stuck => {} }; - // TODO-K: Add missing sleds here - let stuck_sagas = self .datastore() .saga_list_running_or_unwinding_older_than( @@ -755,92 +745,8 @@ impl super::Nexus { /// version async fn component_version_counts( &self, - // opctx: &OpContext, - // target_release: &nexus_db_model::TargetRelease, - // current_tuf_repo: Option, - // inventory: &Arc, status: internal_views::UpdateStatus, ) -> Result, Error> { - // // Build current TargetReleaseDescription, defaulting to Initial if - // // there is no tuf repo ID which, based on DB constraints, happens if - // // and only if target_release_source is 'unspecified', which should only - // // happen in the initial state before any target release has been set - // let curr_target_desc = match current_tuf_repo { - // Some(repo) => { - // TargetReleaseDescription::TufRepo(repo.into_external()) - // } - // None => TargetReleaseDescription::Initial, - // }; - // - // // Get previous target release (if it exists). Build the "prev" - // // TargetReleaseDescription from the previous generation if available, - // // otherwise fall back to Initial. - // let prev_repo_id = - // if let Some(prev_gen) = target_release.generation.prev() { - // self.datastore() - // .target_release_get_generation(opctx, Generation(prev_gen)) - // .await - // .internal_context("fetching previous target release")? - // .and_then(|r| r.tuf_repo_id) - // } else { - // None - // }; - // - // // It should never happen that a target release other than the initial - // // one with target_release_source unspecified should be missing a - // // tuf_repo_id. So if we have a tuf_repo_id for the previous target - // // release, we should always have one for the current target. - // if prev_repo_id.is_some() && target_release.tuf_repo_id.is_none() { - // return Err(Error::internal_error( - // "Target release has no tuf repo but previous release has one", - // )); - // } - // - // let prev_target_desc = match prev_repo_id { - // Some(id) => TargetReleaseDescription::TufRepo( - // self.datastore() - // .tuf_repo_get_by_id(opctx, id.into()) - // .await? - // .into_external(), - // ), - // None => TargetReleaseDescription::Initial, - // }; - // - // // Get the list of sleds that should be reported as a part of the update - // // status. (In particular, this allows us to filter out sleds that are - // // physically present but not part of the cluster, as well as add - // // "unknown" counts for sleds that ought to be present but aren't.) - // let expected_sleds = self - // .datastore() - // .sled_list_all_batched( - // opctx, - // SledFilter::SpsUpdatedByReconfigurator, - // ) - // .await? - // .iter() - // .map(|sled| { - // ( - // BaseboardId { - // part_number: sled.part_number().to_string(), - // serial_number: sled.serial_number().to_string(), - // }, - // sled.id(), - // ) - // }) - // .collect(); - // - // // It's weird to use the internal view this way. It would feel more - // // correct to extract shared logic and call it in both places. On the - // // other hand, that sharing would be boilerplatey and not add much yet. - // // So for now, use the internal view, but plan to extract shared logic - // // or do our own thing here once things settle. - // let status = internal_views::UpdateStatus::new( - // &prev_target_desc, - // &curr_target_desc, - // &expected_sleds, - // &inventory, - // ); - let sled_versions = status.sleds.into_iter().flat_map(|sled| { let zone_versions = sled.zones.into_iter().map(|zone| zone.version); From 426815c4f1445ee5d50bc0ca295596bde86679c6 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 21 May 2026 21:38:31 +1200 Subject: [PATCH 3/3] add some comments --- nexus/src/app/update.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index a84da985c8f..38bbf296a60 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -126,6 +126,8 @@ impl UpdateContactSupportChecksInput { .sleds .iter() .filter(|sled| { + // `unknown()` returns the represenation of the update status + // for a given sled ID that isn't present in inventory. **sled == internal_views::SledAgentUpdateStatus::unknown( sled.sled_id,