From 65955161ff50c28401d504383396d7812ad40e33 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 4 May 2026 15:33:50 +0000 Subject: [PATCH] Block multiple sled reservations with the same gen If multiple instance-start sagas are concurrently attempting to allocate for the same instance, this temporarily results in multiple rows in `sled_resource_vmm` with different propolis ids for the same instance id. One of the instance-start sagas will succeed, where the other(s) will unwind (due to an "instance changed state before it could be started" error from `sis_move_to_starting`), and remove the `sled_resource_vmm` record that they added by matching on that saga's propolis id. There's never been a uniqueness constraint for instance id in the `sled_resource_vmm` table, because there can't be, otherwise we'd never be able to migrate an instance (which makes a new record on a different sled for the same instance). For an instance start that performs any new local storage allocation, this is a problem: the latent assumption in inserting / updating local storage related records is that this type of duplication could not occur, that if the insert succeeded then it means the allocation will only be performed once. Because this is not true the CTE will happily stomp all over the local storage allocation related records and that leads to the orphaning seen in the linked issue. The fix is to add a uniqueness constraint to `sled_resource_vmm` that ensures only one record for a given instance id plus the instance state generation number exists. This will not affect migration because the instance state generation is bumped in that case. This commit also changes the local storage related unit tests to clearly specify the ncpus and memory for the fake instances, as inspecting the `sled_resource_vmm` records produced by the test showed the resources didn't match the instance specification. Fixes oxidecomputer/customer-support#1184. --- dev-tools/omdb/src/bin/omdb/db.rs | 21 +- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/sled_resource_vmm.rs | 26 + nexus/db-queries/benches/harness/db_utils.rs | 1 + nexus/db-queries/src/db/datastore/affinity.rs | 1 + nexus/db-queries/src/db/datastore/sled.rs | 567 +++++++++++++----- .../src/db/queries/sled_reservation.rs | 15 +- .../output/sled_insert_resource_query.sql | 5 +- ...sert_resource_query_with_local_storage.sql | 5 +- nexus/db-schema/src/schema.rs | 1 + .../background/tasks/abandoned_vmm_reaper.rs | 13 + nexus/src/app/sagas/instance_common.rs | 7 +- nexus/src/app/sagas/instance_migrate.rs | 1 + nexus/src/app/sagas/instance_start.rs | 1 + nexus/src/app/sled.rs | 2 + schema/crdb/dbinit.sql | 13 +- .../up01.sql | 3 + .../up02.sql | 4 + .../up02.verify.sql | 2 + 19 files changed, 522 insertions(+), 169 deletions(-) create mode 100644 schema/crdb/sled-resource-vmm-instance-state-generation/up01.sql create mode 100644 schema/crdb/sled-resource-vmm-instance-state-generation/up02.sql create mode 100644 schema/crdb/sled-resource-vmm-instance-state-generation/up02.verify.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 22d553fbdcc..3fdc071432c 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -7865,25 +7865,24 @@ async fn cmd_db_vmm_info( include_sled_id: bool, ) { use db::model::ByteCount; - let db::model::SledResourceVmm { - id: _, - sled_id, - resources: - db::model::Resources { - hardware_threads, - rss_ram: ByteCount(rss), - reservoir_ram: ByteCount(reservoir), - }, - instance_id: _, - } = resource; + + let sled_id = &resource.sled_id; + let db::model::Resources { + hardware_threads, + rss_ram: ByteCount(rss), + reservoir_ram: ByteCount(reservoir), + } = &resource.resources; + const SLED_ID: &'static str = "sled ID"; const THREADS: &'static str = "hardware threads"; const RSS: &'static str = "RSS RAM"; const RESERVOIR: &'static str = "reservoir RAM"; const WIDTH: usize = const_max_len(&[SLED_ID, THREADS, RSS, RESERVOIR]); + if include_sled_id { println!(" {SLED_ID:>WIDTH$}: {sled_id}"); } + println!(" {THREADS:>WIDTH$}: {hardware_threads}"); println!(" {RSS:>WIDTH$}: {rss}"); println!(" {RESERVOIR:>WIDTH$}: {reservoir}"); diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 5caf71e9e28..6cdc3fac1b4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(261, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(262, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(262, "sled-resource-vmm-instance-state-generation"), KnownVersion::new(261, "remove-add-zones-with-mupdate-override"), KnownVersion::new(260, "ereport-trim-serial-trailing-nulls"), KnownVersion::new(259, "vmm-failure-reason"), diff --git a/nexus/db-model/src/sled_resource_vmm.rs b/nexus/db-model/src/sled_resource_vmm.rs index c64716ff1b5..1dab83d85cd 100644 --- a/nexus/db-model/src/sled_resource_vmm.rs +++ b/nexus/db-model/src/sled_resource_vmm.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use crate::Generation; use crate::typed_uuid::DbTypedUuid; use crate::{ByteCount, SqlU32}; use nexus_db_schema::schema::sled_resource_vmm; @@ -34,6 +35,14 @@ impl Resources { } } +#[derive(Clone, Debug)] +pub enum SledResourceVmmInstanceStateGeneration { + /// Row was created prior to Nexus recording this + Unspecified, + + Specified(Generation), +} + /// Describes sled resource usage by a VMM #[derive(Clone, Selectable, Queryable, Insertable, Debug)] #[diesel(table_name = sled_resource_vmm)] @@ -45,6 +54,9 @@ pub struct SledResourceVmm { pub resources: Resources, pub instance_id: Option, + + /// The instance's state_generation at the moment of reservation + instance_state_generation: Option, } impl SledResourceVmm { @@ -53,12 +65,14 @@ impl SledResourceVmm { instance_id: InstanceUuid, sled_id: SledUuid, resources: Resources, + instance_state_generation: Generation, ) -> Self { Self { id: id.into(), instance_id: Some(instance_id.into()), sled_id: sled_id.into(), resources, + instance_state_generation: Some(instance_state_generation), } } @@ -73,4 +87,16 @@ impl SledResourceVmm { pub fn instance_id(&self) -> Option { self.instance_id.map(|x| x.into()) } + + pub fn instance_state_generation( + &self, + ) -> SledResourceVmmInstanceStateGeneration { + match &self.instance_state_generation { + Some(generation) => { + SledResourceVmmInstanceStateGeneration::Specified(*generation) + } + + None => SledResourceVmmInstanceStateGeneration::Unspecified, + } + } } diff --git a/nexus/db-queries/benches/harness/db_utils.rs b/nexus/db-queries/benches/harness/db_utils.rs index 390568fb114..5d76ab9f4d7 100644 --- a/nexus/db-queries/benches/harness/db_utils.rs +++ b/nexus/db-queries/benches/harness/db_utils.rs @@ -73,6 +73,7 @@ pub async fn create_reservation( .sled_reservation_create( &opctx, instance_id, + nexus_db_model::Generation::new(), vmm_id, small_resource_request(), SledReservationConstraintBuilder::new().build(), diff --git a/nexus/db-queries/src/db/datastore/affinity.rs b/nexus/db-queries/src/db/datastore/affinity.rs index 7689ec06e39..c52afb6fcd2 100644 --- a/nexus/db-queries/src/db/datastore/affinity.rs +++ b/nexus/db-queries/src/db/datastore/affinity.rs @@ -1314,6 +1314,7 @@ mod tests { ByteCount::from_kibibytes_u32(1).into(), ByteCount::from_kibibytes_u32(1).into(), ), + nexus_db_model::Generation::new(), )) .execute_async( &*datastore.pool_connection_for_tests().await.unwrap(), diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 36872c074c6..29acaaa33bb 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -101,6 +101,8 @@ enum SledReservationError { group membership." )] RequiredAffinitySledNotValid, + #[error("Instance reservation already made for generation {generation}")] + ReservationExists { generation: external::Generation }, } impl From for external::Error { @@ -129,6 +131,11 @@ impl From for external::Error { | SledReservationError::ConflictingAntiAndAffinityConstraints => { external::Error::invalid_request(&msg) }, + // A concurrent request to place the same instance at the same + // generation number landed already. + SledReservationError::ReservationExists { generation: _ } => { + external::Error::conflict(&msg) + } } } } @@ -658,6 +665,11 @@ impl<'a> Iterator for CompleteLocalStorageAllocationLists<'a> { } } +/// This constraint prevents an instance from having multiple sled_resource_vmm +/// records for the same generation number. +const SINGLE_RESERVATION_CONSTRAINT: &'static str = + "single_vmm_reservation_per_generation"; + impl DataStore { /// Stores a new sled in the database. /// @@ -850,6 +862,7 @@ impl DataStore { &self, opctx: &OpContext, instance_id: InstanceUuid, + instance_state_generation: db::model::Generation, propolis_id: PropolisUuid, resources: db::model::Resources, constraints: db::model::SledReservationConstraints, @@ -857,6 +870,7 @@ impl DataStore { self.sled_reservation_create_inner( opctx, instance_id, + instance_state_generation, propolis_id, resources, constraints, @@ -875,6 +889,7 @@ impl DataStore { &self, opctx: &OpContext, instance_id: InstanceUuid, + instance_state_generation: db::model::Generation, propolis_id: PropolisUuid, resources: db::model::Resources, constraints: db::model::SledReservationConstraints, @@ -883,6 +898,7 @@ impl DataStore { let log = opctx.log.new(o!( "query" => "sled_reservation", "instance_id" => instance_id.to_string(), + "instance_state_generation" => instance_state_generation.to_string(), "propolis_id" => propolis_id.to_string(), )); @@ -890,22 +906,29 @@ impl DataStore { let conn = self.pool_connection_authorized(opctx).await?; - // Check if resource ID already exists - if so, return it. + // Check if resource with a matching propolis ID already exists - if so, + // return it. // // This check makes this function idempotent. Beyond this point, however // we rely on primary key constraints in the database to prevent // concurrent reservations for same propolis_id. use nexus_db_schema::schema::sled_resource_vmm::dsl as resource_dsl; - let old_resource = resource_dsl::sled_resource_vmm + let existing_resource = resource_dsl::sled_resource_vmm .filter(resource_dsl::id.eq(*propolis_id.as_untyped_uuid())) .select(SledResourceVmm::as_select()) - .limit(1) - .load_async(&*conn) - .await?; + .get_result_async(&*conn) + .await + .optional()?; - if !old_resource.is_empty() { - info!(&log, "sled reservation already occurred, returning"); - return Ok(old_resource[0].clone()); + if let Some(existing_resource) = existing_resource { + info!( + &log, + "existing sled reservation for this instance at generation \ + {:?}", + existing_resource.instance_state_generation(), + ); + + return Ok(existing_resource); } // Get a list of local storage disks attached to this instance @@ -1176,6 +1199,7 @@ impl DataStore { instance_id, sled_target, resources.clone(), + instance_state_generation, ); if !local_storage_allocation_required { @@ -1206,6 +1230,24 @@ impl DataStore { } } + Err(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::UniqueViolation, + error_info, + )) if error_info.constraint_name() + == Some(SINGLE_RESERVATION_CONSTRAINT) => + { + // The table already has a reservation for this instance + // id and generation number. + return Err( + SledReservationTransactionError::Reservation( + SledReservationError::ReservationExists { + generation: instance_state_generation + .into(), + }, + ), + ); + } + Err(e) => { if let Some(sentinel) = matches_sentinel(&e, &SLED_INSERT_QUERY_SENTINELS) @@ -1341,6 +1383,24 @@ impl DataStore { } } + Err(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::UniqueViolation, + error_info, + )) if error_info.constraint_name() + == Some(SINGLE_RESERVATION_CONSTRAINT) => + { + // The table already has a reservation for this + // instance id and generation number. + return Err( + SledReservationTransactionError::Reservation( + SledReservationError::ReservationExists { + generation: instance_state_generation + .into(), + }, + ), + ); + } + Err(e) => { if let Some(sentinel) = matches_sentinel( &e, @@ -2124,6 +2184,7 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( &opctx, InstanceUuid::new_v4(), + db::model::Generation::new(), PropolisUuid::new_v4(), resources.clone(), constraints, @@ -2145,6 +2206,7 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( &opctx, InstanceUuid::new_v4(), + db::model::Generation::new(), PropolisUuid::new_v4(), resources.clone(), constraints, @@ -2336,6 +2398,26 @@ pub(in crate::db::datastore) mod test { } } + fn from_local_storage_test_instance( + local_storage_test_instance: &LocalStorageTestInstance, + ) -> Self { + Self { + id: local_storage_test_instance.id, + groups: vec![], + force_onto_sled: None, + resources: db::model::Resources::new( + local_storage_test_instance.ncpus.into(), + local_storage_test_instance.memory.into(), + local_storage_test_instance.memory.into(), + ), + cpu_platform: None, + } + } + + fn resources(&self) -> db::model::Resources { + self.resources.clone() + } + // This is the first half of creating a sled reservation. // It can be called during tests trying to invoke contention manually. async fn find_targets( @@ -2377,6 +2459,7 @@ pub(in crate::db::datastore) mod test { datastore: &DataStore, propolis_id: PropolisUuid, sled_id: SledUuid, + state_generation: db::model::Generation, ) -> bool { assert!(self.force_onto_sled.is_none()); @@ -2385,6 +2468,7 @@ pub(in crate::db::datastore) mod test { self.id, sled_id, self.resources.clone(), + state_generation, ); let conn = datastore.pool_connection_for_tests().await.unwrap(); @@ -2490,6 +2574,7 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create_inner( &opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), instance.resources.clone(), constraints.build(), @@ -3305,6 +3390,7 @@ pub(in crate::db::datastore) mod test { &datastore, PropolisUuid::new_v4(), sleds[i].id(), + Generation::new(), ) .await, "Shouldn't have been able to insert into sled {i}" @@ -3318,6 +3404,7 @@ pub(in crate::db::datastore) mod test { &datastore, PropolisUuid::new_v4(), sleds[0].id(), + Generation::new(), ) .await ); @@ -3405,6 +3492,7 @@ pub(in crate::db::datastore) mod test { &datastore, PropolisUuid::new_v4(), sleds[0].id(), + Generation::new(), ) .await, "Shouldn't have been able to insert into sleds[0]" @@ -3417,6 +3505,7 @@ pub(in crate::db::datastore) mod test { &datastore, PropolisUuid::new_v4(), sleds[1].id(), + Generation::new(), ) .await ); @@ -3489,6 +3578,7 @@ pub(in crate::db::datastore) mod test { &datastore, PropolisUuid::new_v4(), sleds[i].id(), + Generation::new(), ) .await, "Shouldn't have been able to insert into sleds[i]" @@ -3502,6 +3592,7 @@ pub(in crate::db::datastore) mod test { &datastore, PropolisUuid::new_v4(), sleds[1].id(), + Generation::new(), ) .await ); @@ -3992,6 +4083,8 @@ pub(in crate::db::datastore) mod test { struct LocalStorageTestInstance { id: InstanceUuid, name: String, + ncpus: u16, + memory: external::ByteCount, affinity: Option<(Affinity, usize)>, disks: Vec, } @@ -4021,7 +4114,7 @@ pub(in crate::db::datastore) mod test { is_scrimlet: false, usable_hardware_threads: 128, usable_physical_ram: (64 << 30).try_into().unwrap(), - reservoir_size: (16 << 30).try_into().unwrap(), + reservoir_size: (56 << 30).try_into().unwrap(), cpu_family: SledCpuFamily::AmdMilan, }, Uuid::new_v4(), @@ -4138,6 +4231,8 @@ pub(in crate::db::datastore) mod test { &authz_project, instance.id, &instance.name.as_str(), + instance.ncpus, + instance.memory, ) .await; @@ -4395,6 +4490,8 @@ pub(in crate::db::datastore) mod test { authz_project: &authz::Project, instance_id: InstanceUuid, name: &str, + ncpus: u16, + memory: external::ByteCount, ) -> authz::Instance { datastore .project_create_instance( @@ -4408,8 +4505,8 @@ pub(in crate::db::datastore) mod test { name: name.parse().unwrap(), description: "It's an instance".into(), }, - ncpus: 2i64.try_into().unwrap(), - memory: external::ByteCount::from_gibibytes_u32(16), + ncpus: external::InstanceCpuCount(ncpus), + memory, hostname: "myhostname".try_into().unwrap(), user_data: Vec::new(), network_interfaces: @@ -4567,6 +4664,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -4578,7 +4677,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -4589,12 +4689,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -4672,6 +4769,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -4683,7 +4782,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // Add an allocation for this disk to the first sled's zpool set_local_storage_unencrypted_allocation( @@ -4707,12 +4807,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -4764,6 +4861,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -4775,7 +4874,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); set_local_storage_unencrypted_dataset_no_provision( datastore, @@ -4792,12 +4892,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -4847,6 +4944,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -4858,7 +4957,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If // we set a crucible dataset size_used of 300 GiB, then ensure a 512 GiB @@ -4880,12 +4980,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -4903,12 +5000,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -4960,6 +5054,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -4979,7 +5075,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -4990,12 +5087,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5067,6 +5161,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -5086,7 +5182,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -5097,12 +5194,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5178,6 +5272,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -5197,7 +5293,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If // we set the first U2's crucible dataset size_used of 300 GiB, then @@ -5219,12 +5316,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5275,12 +5369,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5356,6 +5447,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -5375,7 +5468,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // One of the local storage have been allocated already. @@ -5399,12 +5493,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5481,6 +5572,8 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 32, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -5504,6 +5597,8 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local2".to_string(), + ncpus: 32, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -5530,7 +5625,8 @@ pub(in crate::db::datastore) mod test { setup_local_storage_allocation_test(&opctx, datastore, &config).await; for instance in &config.instances { - let instance = Instance::new_with_id(instance.id); + let instance = + Instance::from_local_storage_test_instance(&instance); // the output of the find targets query does not currently take // required local storage allocations into account @@ -5540,12 +5636,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 32, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5623,6 +5716,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -5642,7 +5737,8 @@ pub(in crate::db::datastore) mod test { }; setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // One of the local storage have been allocated already to the first U2 @@ -5677,12 +5773,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5756,6 +5849,9 @@ pub(in crate::db::datastore) mod test { config.instances.push(LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: format!("inst{i}"), + // consume all the available threads + ncpus: 128, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks, }); @@ -5766,7 +5862,8 @@ pub(in crate::db::datastore) mod test { let mut vmms = vec![]; for (i, config_instance) in config.instances.iter().enumerate() { - let instance = Instance::new_with_id(config_instance.id); + let instance = + Instance::from_local_storage_test_instance(&config_instance); // the output of the find targets query does not currently take // required local storage allocations into account, but each VMM @@ -5777,12 +5874,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 128, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5867,6 +5961,9 @@ pub(in crate::db::datastore) mod test { config.instances.push(LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: format!("inst{i}"), + // consume all the available threads + ncpus: 128, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks, }); @@ -5882,7 +5979,9 @@ pub(in crate::db::datastore) mod test { let mut jhs = vec![]; for (i, config_instance) in chunk.iter().enumerate() { - let instance = Instance::new_with_id(config_instance.id); + let instance = Instance::from_local_storage_test_instance( + &config_instance, + ); let datastore = datastore.clone(); @@ -5896,12 +5995,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create_inner( &opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 128, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -5928,6 +6024,170 @@ pub(in crate::db::datastore) mod test { logctx.cleanup_successful(); } + /// Ensure that a full rack can have one VMM take all the U.2s on each sled, + /// where the sled reservations happen concurrently in chunks, and multiple + /// requests for the same instance occur. + #[tokio::test] + async fn local_storage_allocation_full_rack_concurrent_multi() { + let logctx = dev::test_setup_log( + "local_storage_allocation_full_rack_concurrent_multi", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut config = LocalStorageTest { + sleds: vec![], + affinity_groups: vec![], + anti_affinity_groups: vec![], + instances: vec![], + }; + + const MAX_U2_PER_INSTANCE: usize = 10; + + for i in 0..32 { + let mut u2s = vec![]; + + for n in 0..MAX_U2_PER_INSTANCE { + u2s.push(LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: format!("phys_{i}_{n}"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: format!( + "[fd00:1122:3344:{i}{n:02x}::1]:12345" + ) + .parse() + .unwrap(), + + local_storage_unencrypted_dataset_id: DatasetUuid::new_v4(), + }); + } + + config.sleds.push(LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: format!("sled_{i}"), + u2s, + }); + + let mut disks = vec![]; + + for n in 0..MAX_U2_PER_INSTANCE { + disks.push(LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from(format!("local-{i}-{n}")) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }); + } + + config.instances.push(LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: format!("inst{i}"), + // consume all the available threads + ncpus: 128, + memory: external::ByteCount::from_gibibytes_u32(16), + affinity: None, + disks, + }); + } + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + let mut vmms = vec![]; + + let mut success = 0; + + // Reserve the instances concurrently in chunks - 4 at a time, because + // qorb currently supports a max of 8, and we're issuing the same + // request twice. + for chunk in config.instances.chunks(4) { + let mut jhs = vec![]; + + for (i, config_instance) in chunk.iter().enumerate() { + let instance = Instance::from_local_storage_test_instance( + &config_instance, + ); + + // Issue the same request twice + + for _ in 0..2 { + let datastore = datastore.clone(); + + let opctx = opctx.child(BTreeMap::from([( + String::from("task"), + format!("{i}"), + )])); + + let jh = tokio::spawn({ + let resources = instance.resources(); + + async move { + datastore + .sled_reservation_create_inner( + &opctx, + instance.id, + db::model::Generation::new(), + PropolisUuid::new_v4(), + resources, + db::model::SledReservationConstraints::none( + ), + ) + .await + } + }); + + jhs.push(jh); + } + } + + for jh in jhs { + let vmm = match jh.await.unwrap() { + Ok(vmm) => { + success += 1; + vmm + } + + Err(SledReservationTransactionError::Reservation( + SledReservationError::NotFound, + )) => { + // eat this, double check number of VMMs later + continue; + } + + Err(SledReservationTransactionError::Reservation( + SledReservationError::ReservationExists { generation }, + )) if generation == external::Generation::new() => { + // eat this, it's expected due to concurrent requests + continue; + } + + Err(e) => panic!("{e:?}"), + }; + + vmms.push(vmm); + } + } + + assert_eq!(success, 32); + + let sleds: HashSet<_> = + vmms.into_iter().map(|vmm| vmm.sled_id).collect(); + + assert_eq!(sleds.len(), 32); + + validate_local_storage_allocations(&datastore).await; + + db.terminate().await; + logctx.cleanup_successful(); + } + /// Ensure that four sleds can have _many_ VMMs (enough to fully use all /// cpus) allocate local storage, where the sled reservations happen /// concurrently in chunks. @@ -6008,6 +6268,10 @@ pub(in crate::db::datastore) mod test { config.instances.push(LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: format!("inst{i}"), + ncpus: 2, + // 64 instances per sled, where each sled has 56 GB of reservoir + // RAM = 875 MB per instance + memory: external::ByteCount::from_mebibytes_u32(512), affinity: None, disks, }); @@ -6023,7 +6287,9 @@ pub(in crate::db::datastore) mod test { let mut jhs = vec![]; for (i, config_instance) in chunk.iter().enumerate() { - let instance = Instance::new_with_id(config_instance.id); + let instance = Instance::from_local_storage_test_instance( + &config_instance, + ); let datastore = datastore.clone(); @@ -6037,12 +6303,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create_inner( &opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 2, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -6143,6 +6406,8 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: Some((Affinity::Positive, 0)), disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -6154,6 +6419,8 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local2".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: Some((Affinity::Positive, 0)), disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -6170,7 +6437,8 @@ pub(in crate::db::datastore) mod test { // The first instance's sled reservation should succeed, there's enough // space for the disk - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -6181,12 +6449,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -6196,7 +6461,8 @@ pub(in crate::db::datastore) mod test { // the affinity group's policy is set to Fail, and there isn't enough // space for the second instance's disk on the one sled. - let instance = Instance::new_with_id(config.instances[1].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[1]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -6206,12 +6472,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -6270,6 +6533,8 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: Some((Affinity::Negative, 0)), disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -6281,6 +6546,8 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local2".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: Some((Affinity::Negative, 0)), disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -6296,7 +6563,8 @@ pub(in crate::db::datastore) mod test { // The first instance's sled reservation should succeed - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -6307,12 +6575,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -6323,7 +6588,8 @@ pub(in crate::db::datastore) mod test { // The second instance's sled reservation should not succeed, because // the anti-affinity group's policy is set to Fail. - let instance = Instance::new_with_id(config.instances[1].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[1]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -6333,12 +6599,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -6435,6 +6698,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -6459,7 +6724,8 @@ pub(in crate::db::datastore) mod test { // succeed, as it needs two local storage allocations (which it could // get if it was allowed to use sled_1). - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // the output of the find targets query does not currently take required // local storage allocations into account @@ -6474,12 +6740,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), constraints, ) .await @@ -6554,6 +6817,8 @@ pub(in crate::db::datastore) mod test { instances: vec![LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local".to_string(), + ncpus: 2, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![ LocalStorageTestInstanceDisk { @@ -6574,7 +6839,8 @@ pub(in crate::db::datastore) mod test { setup_local_storage_allocation_test(&opctx, datastore, &config).await; - let instance = Instance::new_with_id(config.instances[0].id); + let instance = + Instance::from_local_storage_test_instance(&config.instances[0]); // Detach the second disk from the instance let (.., authz_instance) = LookupPath::new(&opctx, datastore) @@ -6602,12 +6868,9 @@ pub(in crate::db::datastore) mod test { .sled_reservation_create( opctx, instance.id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 1, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + instance.resources(), db::model::SledReservationConstraints::none(), ) .await @@ -6687,6 +6950,9 @@ pub(in crate::db::datastore) mod test { id: InstanceUuid::new_v4(), name: "local".to_string(), affinity: None, + // consume _almost_ all available threads + ncpus: 96, + memory: external::ByteCount::from_gibibytes_u32(16), disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), name: external::Name::try_from("local".to_string()) @@ -6697,6 +6963,9 @@ pub(in crate::db::datastore) mod test { LocalStorageTestInstance { id: InstanceUuid::new_v4(), name: "local2".to_string(), + // consume _almost_ all available threads + ncpus: 96, + memory: external::ByteCount::from_gibibytes_u32(16), affinity: None, disks: vec![LocalStorageTestInstanceDisk { id: Uuid::new_v4(), @@ -6717,19 +6986,21 @@ pub(in crate::db::datastore) mod test { let datastore = db.datastore().clone(); let opctx = OpContext::for_tests(logctx.log.clone(), datastore.clone()); - let instance_id = config.instances[0].id; + + let instance = Instance::from_local_storage_test_instance( + &config.instances[0], + ); + let instance_id = instance.id; + let resources = instance.resources(); async move { datastore .sled_reservation_create( &opctx, instance_id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 96, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + resources, db::model::SledReservationConstraints::none(), ) .await @@ -6740,19 +7011,21 @@ pub(in crate::db::datastore) mod test { let datastore = db.datastore().clone(); let opctx = OpContext::for_tests(logctx.log.clone(), datastore.clone()); - let instance_id = config.instances[1].id; + + let instance = Instance::from_local_storage_test_instance( + &config.instances[1], + ); + let instance_id = instance.id; + let resources = instance.resources(); async move { datastore .sled_reservation_create( &opctx, instance_id, + db::model::Generation::new(), PropolisUuid::new_v4(), - db::model::Resources::new( - 96, - ByteCount::try_from(1024).unwrap(), - ByteCount::try_from(1024).unwrap(), - ), + resources, db::model::SledReservationConstraints::none(), ) .await diff --git a/nexus/db-queries/src/db/queries/sled_reservation.rs b/nexus/db-queries/src/db/queries/sled_reservation.rs index cdd1c3968d2..1c39150d6d2 100644 --- a/nexus/db-queries/src/db/queries/sled_reservation.rs +++ b/nexus/db-queries/src/db/queries/sled_reservation.rs @@ -6,6 +6,7 @@ use crate::db::model::Resources; use crate::db::model::SledResourceVmm; +use crate::db::model::SledResourceVmmInstanceStateGeneration; use crate::db::raw_query_builder::QueryBuilder; use crate::db::raw_query_builder::TrustedStr; use crate::db::raw_query_builder::TypedSqlQuery; @@ -833,13 +834,14 @@ pub fn sled_insert_resource_query( // Finally, perform the INSERT if it's still valid. query.sql(" - INSERT INTO sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) + INSERT INTO sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id, instance_state_generation) SELECT ").param().sql(", ").param().sql(", ").param().sql(", ").param().sql(", ").param().sql(", + ").param().sql(", ").param().sql(" WHERE EXISTS(SELECT 1 FROM insert_valid) ") @@ -848,7 +850,14 @@ pub fn sled_insert_resource_query( .bind::(resource.resources.hardware_threads) .bind::(resource.resources.rss_ram) .bind::(resource.resources.reservoir_ram) - .bind::(instance_id); + .bind::(instance_id) + .bind::, _>( + match resource.instance_state_generation() { + SledResourceVmmInstanceStateGeneration::Specified(generation) => + Some(generation), + SledResourceVmmInstanceStateGeneration::Unspecified => None, + } + ); query.query() } @@ -950,6 +959,7 @@ mod test { external::ByteCount::from_gibibytes_u32(0), ), ), + model::Generation::new(), ); // with no local storage @@ -1022,6 +1032,7 @@ mod test { external::ByteCount::from_gibibytes_u32(0), ), ), + model::Generation::new(), ); let query = sled_insert_resource_query( diff --git a/nexus/db-queries/tests/output/sled_insert_resource_query.sql b/nexus/db-queries/tests/output/sled_insert_resource_query.sql index 354925ee63e..f6f40a15f28 100644 --- a/nexus/db-queries/tests/output/sled_insert_resource_query.sql +++ b/nexus/db-queries/tests/output/sled_insert_resource_query.sql @@ -116,8 +116,9 @@ WITH ) INSERT INTO - sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) + sled_resource_vmm + (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id, instance_state_generation) SELECT - $11, $12, $13, $14, $15, $16 + $11, $12, $13, $14, $15, $16, $17 WHERE EXISTS(SELECT 1 FROM insert_valid) diff --git a/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql index f1d6a6b6643..de1641894bf 100644 --- a/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql +++ b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql @@ -298,8 +298,9 @@ WITH ) INSERT INTO - sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) + sled_resource_vmm + (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id, instance_state_generation) SELECT - $45, $46, $47, $48, $49, $50 + $45, $46, $47, $48, $49, $50, $51 WHERE EXISTS(SELECT 1 FROM insert_valid) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 05d6c96b1db..6ca7893674a 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1103,6 +1103,7 @@ table! { rss_ram -> Int8, reservoir_ram -> Int8, instance_id -> Nullable, + instance_state_generation -> Nullable, } } diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 40d6c04b2d7..76b3d67369f 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -196,6 +196,7 @@ mod tests { use super::*; use chrono::Utc; + use nexus_db_lookup::LookupPath; use nexus_db_model::ByteCount; use nexus_db_model::Generation; use nexus_db_model::Resources; @@ -203,6 +204,7 @@ mod tests { use nexus_db_model::Vmm; use nexus_db_model::VmmCpuPlatform; use nexus_db_model::VmmState; + use nexus_db_queries::authz; use nexus_test_utils::resource_helpers; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::InstanceUuid; @@ -255,19 +257,29 @@ mod tests { ) .await .expect("destroyed vmm record should be created successfully"); + let resources = Resources::new( 1, // Just require the bare non-zero amount of RAM. ByteCount::try_from(1024).unwrap(), ByteCount::try_from(1024).unwrap(), ); + let constraints = nexus_db_model::SledReservationConstraints::none(); + + let (.., db_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance.identity.id) + .fetch_for(authz::Action::Read) + .await + .unwrap(); + dbg!( datastore .sled_reservation_create( &opctx, InstanceUuid::from_untyped_uuid(instance.identity.id), + db_instance.state_generation, destroyed_vmm_id, resources.clone(), constraints, @@ -275,6 +287,7 @@ mod tests { .await .expect("sled reservation should be created successfully") ); + Self { destroyed_vmm_id } } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 2b0e32d6629..c3be63e3674 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -12,8 +12,9 @@ use crate::app::instance_network::InstanceNetworkFilters; use http::StatusCode; use nexus_db_lookup::LookupPath; use nexus_db_model::{ - ByteCount, ExternalIp, InstanceState, IpAttachState, IpNet, NatEntry, - SledReservationConstraints, SledResourceVmm, VmmCpuPlatform, VmmState, + ByteCount, ExternalIp, Generation, InstanceState, IpAttachState, IpNet, + NatEntry, SledReservationConstraints, SledResourceVmm, VmmCpuPlatform, + VmmState, }; use nexus_db_queries::authz; use nexus_db_queries::db::datastore::ExternalSubnetBeginOpResult; @@ -42,6 +43,7 @@ pub(super) struct VmmAndSledIds { pub async fn reserve_vmm_resources( nexus: &Nexus, instance_id: InstanceUuid, + instance_state_generation: Generation, propolis_id: PropolisUuid, ncpus: u32, guest_memory: ByteCount, @@ -74,6 +76,7 @@ pub async fn reserve_vmm_resources( let resource = nexus .reserve_on_random_sled( instance_id, + instance_state_generation, propolis_id, resources, constraints, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index a713a8335d3..aad57559d87 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -237,6 +237,7 @@ async fn sim_reserve_sled_resources( let resource = super::instance_common::reserve_vmm_resources( osagactx.nexus(), InstanceUuid::from_untyped_uuid(params.instance.id()), + params.instance.state_generation, propolis_id, u32::from(params.instance.ncpus.0.0), params.instance.memory, diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 0899ffcd474..22f5e4ad110 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -231,6 +231,7 @@ async fn sis_alloc_server( let resource = super::instance_common::reserve_vmm_resources( osagactx.nexus(), InstanceUuid::from_untyped_uuid(params.db_instance.id()), + params.db_instance.state_generation, propolis_id, u32::from(hardware_threads.0), reservoir_ram, diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index d2ee2f40c44..5732f7e0e7a 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -211,6 +211,7 @@ impl super::Nexus { pub(crate) async fn reserve_on_random_sled( &self, instance_id: InstanceUuid, + instance_state_generation: db::model::Generation, propolis_id: PropolisUuid, resources: db::model::Resources, constraints: db::model::SledReservationConstraints, @@ -219,6 +220,7 @@ impl super::Nexus { .sled_reservation_create( &self.opctx_alloc, instance_id, + instance_state_generation, propolis_id, resources, constraints, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 308379c85c5..61138906145 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -307,7 +307,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.sled_resource_vmm ( -- If we tried to backfill + make this column non-nullable while that saga -- was mid-execution, we would still have some rows in this table with nullable -- values that would be more complex to fix. - instance_id UUID + instance_id UUID, + + -- The instance's state generation number at the moment of reservation + instance_state_generation INT ); -- Allow looking up all VMM resources which reside on a sled @@ -316,6 +319,12 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_vmm_resource_by_sled ON omicron.public. id ); +-- Allow a single VMM reservation per instance state generation +CREATE UNIQUE INDEX IF NOT EXISTS single_vmm_reservation_per_generation ON omicron.public.sled_resource_vmm ( + instance_id, + instance_state_generation +); + -- Allow looking up all resources by instance CREATE INDEX IF NOT EXISTS lookup_vmm_resource_by_instance ON omicron.public.sled_resource_vmm ( instance_id @@ -8623,7 +8632,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '261.0.0', NULL) + (TRUE, NOW(), NOW(), '262.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/sled-resource-vmm-instance-state-generation/up01.sql b/schema/crdb/sled-resource-vmm-instance-state-generation/up01.sql new file mode 100644 index 00000000000..90127964822 --- /dev/null +++ b/schema/crdb/sled-resource-vmm-instance-state-generation/up01.sql @@ -0,0 +1,3 @@ +ALTER TABLE omicron.public.sled_resource_vmm + ADD COLUMN IF NOT EXISTS instance_state_generation INT + DEFAULT NULL; diff --git a/schema/crdb/sled-resource-vmm-instance-state-generation/up02.sql b/schema/crdb/sled-resource-vmm-instance-state-generation/up02.sql new file mode 100644 index 00000000000..0a78b4a8972 --- /dev/null +++ b/schema/crdb/sled-resource-vmm-instance-state-generation/up02.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS single_vmm_reservation_per_generation ON omicron.public.sled_resource_vmm ( + instance_id, + instance_state_generation +); diff --git a/schema/crdb/sled-resource-vmm-instance-state-generation/up02.verify.sql b/schema/crdb/sled-resource-vmm-instance-state-generation/up02.verify.sql new file mode 100644 index 00000000000..b90c7e1eb74 --- /dev/null +++ b/schema/crdb/sled-resource-vmm-instance-state-generation/up02.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'sled_resource_vmm' AND index_name = 'single_vmm_reservation_per_generation')),'true','Schema change verification failed: index single_vmm_reservation_per_generation on table sled_resource_vmm does not exist') AS BOOL);