diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index 5535c33a34e..38bbf296a60 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -103,6 +103,7 @@ struct UpdateContactSupportChecksInput { blueprint: Arc, // None when no target release has ever been set on the system. current_target_version: Option, + internal_update_status: internal_views::UpdateStatus, } impl UpdateContactSupportChecksInput { @@ -120,6 +121,21 @@ impl UpdateContactSupportChecksInput { } }; + let missing_sleds: BTreeSet = self + .internal_update_status + .sleds + .iter() + .filter(|sled| { + // `unknown()` returns the represenation of the update status + // for a given sled ID that isn't present in inventory. + **sled + == internal_views::SledAgentUpdateStatus::unknown( + sled.sled_id, + ) + }) + .map(|sled| sled.sled_id) + .collect(); + let (stuck_sagas, stuck_sagas_error_message) = match &self.stuck_sagas { Ok(sagas) => ( sagas @@ -162,6 +178,7 @@ impl UpdateContactSupportChecksInput { stale_inventory_last_collection_time_done, unhealthy_zpools_by_sled, enabled_smf_services_not_online_by_sled, + missing_sleds, } } } @@ -192,6 +209,7 @@ struct UpdateStatusProblems { /// Enabled SMF services that are not in an `online` state. enabled_smf_services_not_online_by_sled: BTreeMap, + missing_sleds: BTreeSet, } impl UpdateStatusProblems { @@ -203,6 +221,7 @@ impl UpdateStatusProblems { stale_inventory_last_collection_time_done, unhealthy_zpools_by_sled, enabled_smf_services_not_online_by_sled, + missing_sleds, } = self; stuck_sagas.is_empty() && stuck_sagas_error_message.is_none() @@ -210,6 +229,7 @@ impl UpdateStatusProblems { && stale_inventory_last_collection_time_done.is_none() && unhealthy_zpools_by_sled.is_empty() && enabled_smf_services_not_online_by_sled.is_empty() + && missing_sleds.is_empty() } } @@ -228,6 +248,7 @@ impl KV for UpdateStatusProblems { stale_inventory_last_collection_time_done, unhealthy_zpools_by_sled, enabled_smf_services_not_online_by_sled, + missing_sleds, } = self; if !stuck_sagas.is_empty() { @@ -270,6 +291,12 @@ impl KV for UpdateStatusProblems { &format_args!("{:?}", enabled_smf_services_not_online_by_sled), )?; } + if !missing_sleds.is_empty() { + serializer.emit_arguments( + "missing_sleds".into(), + &format_args!("{:?}", missing_sleds), + )?; + } Ok(()) } } @@ -507,8 +534,8 @@ impl super::Nexus { return Err(Error::internal_error("No inventory collection found")); }; - let components_by_release_version = self - .component_version_counts( + let internal_status = self + .get_internal_update_status( opctx, &db_target_release, current_tuf_repo, @@ -516,6 +543,9 @@ impl super::Nexus { ) .await?; + let components_by_release_version = + self.component_version_counts(internal_status.clone()).await?; + let blueprint_target = self .update_status .latest_blueprint @@ -543,6 +573,7 @@ impl super::Nexus { inventory, Arc::clone(&blueprint_target.blueprint), target_release.as_ref().map(|t| &t.version), + internal_status, ) .await?; @@ -574,6 +605,7 @@ impl super::Nexus { inventory: Arc, blueprint: Arc, current_target_version: Option<&Version>, + internal_update_status: internal_views::UpdateStatus, ) -> Result { // If an update is in progress but not stuck, the remaining checks // could fail mid-update and shouldn't trigger a contact-support @@ -604,6 +636,7 @@ impl super::Nexus { stuck_sagas, blueprint, current_target_version: current_target_version.cloned(), + internal_update_status, }; let problems = checks.problems(); @@ -620,15 +653,13 @@ impl super::Nexus { Ok(contact_support) } - /// Build a map of version strings to the number of components on that - /// version - async fn component_version_counts( + async fn get_internal_update_status( &self, opctx: &OpContext, target_release: &nexus_db_model::TargetRelease, current_tuf_repo: Option, inventory: &Arc, - ) -> Result, Error> { + ) -> Result { // Build current TargetReleaseDescription, defaulting to Initial if // there is no tuf repo ID which, based on DB constraints, happens if // and only if target_release_source is 'unspecified', which should only @@ -709,6 +740,15 @@ impl super::Nexus { &inventory, ); + Ok(status) + } + + /// Build a map of version strings to the number of components on that + /// version + async fn component_version_counts( + &self, + status: internal_views::UpdateStatus, + ) -> Result, Error> { let sled_versions = status.sleds.into_iter().flat_map(|sled| { let zone_versions = sled.zones.into_iter().map(|zone| zone.version); @@ -1033,6 +1073,58 @@ mod test { problems } + // Build an empty `internal_views::UpdateStatus` for tests that don't care + // about the missing-sleds check. + fn empty_internal_update_status() -> internal_views::UpdateStatus { + internal_views::UpdateStatus { + mgs_driven: iddqd::IdOrdMap::new(), + sleds: iddqd::IdOrdMap::new(), + } + } + + // Build a `SledAgentUpdateStatus` that looks "healthy" (i.e. has a known + // boot disk and known slot versions). + fn healthy_sled_agent_update_status( + sled_id: SledUuid, + ) -> internal_views::SledAgentUpdateStatus { + internal_views::SledAgentUpdateStatus { + sled_id, + zones: iddqd::IdOrdMap::new(), + host_phase_2: internal_views::HostPhase2Status { + boot_disk: Ok(omicron_common::disk::M2Slot::A), + slot_a_version: internal_views::TufRepoVersion::Version( + fake_target_version(), + ), + slot_b_version: internal_views::TufRepoVersion::Version( + fake_target_version(), + ), + }, + } + } + + // Build an `internal_views::UpdateStatus` whose `sleds` contains an + // "unknown" entry for each provided missing sled id, plus a healthy entry + // for each provided healthy sled id. + fn internal_update_status_with_missing_sleds( + missing_sled_ids: impl IntoIterator, + healthy_sled_ids: impl IntoIterator, + ) -> internal_views::UpdateStatus { + let mut sleds: iddqd::IdOrdMap = + missing_sled_ids + .into_iter() + .map(internal_views::SledAgentUpdateStatus::unknown) + .collect(); + for sled_id in healthy_sled_ids { + sleds + .insert_unique(healthy_sled_agent_update_status(sled_id)) + .expect("sled ids are unique"); + } + internal_views::UpdateStatus { + mgs_driven: iddqd::IdOrdMap::new(), + sleds, + } + } + #[nexus_test(server = crate::Server)] async fn test_contact_support_healthy_system( cptestctx: &ControlPlaneTestContext, @@ -1064,7 +1156,13 @@ mod test { // should be false assert!( !nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1098,7 +1196,13 @@ mod test { // should be true assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1132,7 +1236,13 @@ mod test { // support should be true assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1166,7 +1276,13 @@ mod test { fake_blueprint(&cptestctx.logctx.log, &version, Utc::now(), false); assert!( nexus - .contact_support(&opctx, inventory, blueprint, None,) + .contact_support( + &opctx, + inventory, + blueprint, + None, + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1201,7 +1317,13 @@ mod test { // support should be true assert!( nexus - .contact_support(&opctx, inventory, blueprint, None,) + .contact_support( + &opctx, + inventory, + blueprint, + None, + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1240,7 +1362,13 @@ mod test { // update is considered stuck and contact support is true. assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) .await .unwrap() ); @@ -1285,11 +1413,21 @@ mod test { true, ); // Every health check is unhealthy: stuck saga, stuck update, stale - // inventory, unhealthy zpools, and unhealthy SMF services. Contact - // support should be true. + // inventory, unhealthy zpools, and unhealthy SMF services, plus a + // missing sled. Contact support should be true. + let missing_sled_id = SledUuid::new_v4(); assert!( nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + internal_update_status_with_missing_sleds( + [missing_sled_id], + [sled_id()], + ), + ) .await .unwrap() ); @@ -1324,7 +1462,57 @@ mod test { fake_blueprint(&cptestctx.logctx.log, &version, Utc::now(), true); assert!( !nexus - .contact_support(&opctx, inventory, blueprint, Some(&version),) + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + empty_internal_update_status(), + ) + .await + .unwrap() + ); + } + + #[nexus_test(server = crate::Server)] + async fn test_contact_support_missing_sleds( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let opctx = fake_opctx(cptestctx); + insert_fake_collection( + cptestctx, + &opctx, + healthy_zpools(), + healthy_services(), + ) + .await; + let inventory = Arc::new( + nexus + .datastore() + .inventory_get_latest_collection(&opctx) + .await + .unwrap() + .unwrap(), + ); + let version = fake_target_version(); + let blueprint = + fake_blueprint(&cptestctx.logctx.log, &version, Utc::now(), false); + // The system is otherwise healthy, but a sled we expected to see in + // inventory is missing. Contact support should be true. + let missing_sled_id = SledUuid::new_v4(); + assert!( + nexus + .contact_support( + &opctx, + inventory, + blueprint, + Some(&version), + internal_update_status_with_missing_sleds( + [missing_sled_id], + [sled_id()], + ), + ) .await .unwrap() ); @@ -1378,6 +1566,7 @@ mod test { inventory, blueprint, Some(&target_release), + empty_internal_update_status(), ) .await .unwrap() @@ -1403,6 +1592,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; assert_eq!(checks.problems(), UpdateStatusProblems::default()); @@ -1431,6 +1621,7 @@ mod test { stuck_sagas: Ok(vec![saga]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1463,6 +1654,7 @@ mod test { stuck_sagas: Err(err), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1495,6 +1687,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1529,6 +1722,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; assert_eq!(checks.problems(), UpdateStatusProblems::default()); @@ -1558,6 +1752,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1594,6 +1789,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected_zpool = Zpool { @@ -1636,6 +1832,7 @@ mod test { stuck_sagas: Ok(vec![]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected = UpdateStatusProblems { @@ -1677,6 +1874,7 @@ mod test { stuck_sagas: Ok(vec![saga]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: empty_internal_update_status(), }; let expected_zpool = Zpool { @@ -1728,11 +1926,16 @@ mod test { Utc::now() - STALE_INVENTORY_THRESHOLD - TimeDelta::seconds(10); collection.time_done = collection_time_done; + let missing_sled_id = SledUuid::new_v4(); let checks = UpdateContactSupportChecksInput { inventory: Arc::new(collection), stuck_sagas: Ok(vec![saga]), blueprint, current_target_version: Some(fake_target_version()), + internal_update_status: internal_update_status_with_missing_sleds( + [missing_sled_id], + [sled_id], + ), }; let expected_zpool = Zpool { @@ -1757,9 +1960,48 @@ mod test { enabled_smf_services_not_online_by_sled: BTreeMap::from([( sled_id, services, )]), + missing_sleds: BTreeSet::from([missing_sled_id]), }; assert_eq!(normalise_zpool_times(checks.problems()), expected); logctx.cleanup_successful(); } + + #[test] + fn test_problems_missing_sleds() { + let logctx = test_setup_log("test_problems_missing_sleds"); + let blueprint = fake_blueprint( + &logctx.log, + &fake_target_version(), + Utc::now(), + false, + ); + + // Two sleds expected: one missing from inventory, one present and + // healthy. Only the missing sled should be picked up. + let missing_sled_id = SledUuid::new_v4(); + let healthy_sled_id = SledUuid::new_v4(); + let checks = UpdateContactSupportChecksInput { + inventory: Arc::new(fake_collection_with_ids( + healthy_sled_id, + healthy_zpools(), + healthy_services(), + )), + stuck_sagas: Ok(vec![]), + blueprint, + current_target_version: Some(fake_target_version()), + internal_update_status: internal_update_status_with_missing_sleds( + [missing_sled_id], + [healthy_sled_id], + ), + }; + + let expected = UpdateStatusProblems { + missing_sleds: BTreeSet::from([missing_sled_id]), + ..Default::default() + }; + assert_eq!(checks.problems(), expected); + + logctx.cleanup_successful(); + } }