From f1ba0a22a961e8bdae110c7b1a796bc56dfd10a8 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 11 Jun 2026 01:39:36 -0700 Subject: [PATCH] feat(dstack-util): mix gcp vTPM AK pubkey into instance_id instance_id is derived from instance_id_seed, which is persisted on the data disk. On GCP a VM can be cloned from a disk image / snapshot, so every clone inherits the same seed and thus the same instance_id, letting multiple running VMs share one identity. On GCP, mix the public key of the pre-provisioned vTPM Attestation Key into the instance_id. The AK is derived deterministically from the per-instance Endorsement seed held in the vTPM (not on the data disk), so it is stable across reboot/stop-start but fresh on a disk clone. We hash the AK public area rather than its certificate so the binding is immune to certificate re-issuance: a re-signed cert carries new serial/ validity/signature bytes for the same key, which would otherwise change instance_id without a clone. (Observed cert validity is ~30 years from instance creation, so re-issuance is unlikely, but the pubkey removes the dependency entirely.) tpm-attest: expose the AK public area on LoadedAk (previously discarded). Verified on real c3-standard-4 TDX confidential VMs: - reboot: AK unchanged - stop/start: AK unchanged - clone from disk image: AK differs Fails closed: if GCP is detected but the AK cannot be loaded, error instead of silently falling back to the seed-only id. Other platforms are unaffected. --- dstack-util/src/system_setup.rs | 44 +++++++++++++++++++++++++++++++++ tpm-attest/src/gcp_ak.rs | 13 ++++++++-- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/dstack-util/src/system_setup.rs b/dstack-util/src/system_setup.rs index 8ab531fcf..2e23f98b0 100644 --- a/dstack-util/src/system_setup.rs +++ b/dstack-util/src/system_setup.rs @@ -706,6 +706,46 @@ fn truncate(s: &[u8], len: usize) -> &[u8] { } } +/// Return a platform-provided, per-instance value to mix into `instance_id`. 
+/// +/// `instance_id` is normally derived from `instance_id_seed`, which is persisted +/// on the data disk. That makes it unsafe on clouds where a VM can be cloned from +/// a disk image / snapshot: every clone inherits the same seed and therefore the +/// same `instance_id`. To keep `instance_id` unique per running VM we mix in a +/// per-instance value that lives outside the cloneable disk. +/// +/// On GCP we use the public key of the pre-provisioned vTPM Attestation Key. The AK +/// is derived deterministically from the per-instance Endorsement seed held in the +/// vTPM (not on the data disk), so a VM cloned from a disk image derives a different +/// AK while a reboot/stop-start of the same VM keeps it stable — exactly the property +/// we need. We hash the AK public area rather than its certificate so the binding is +/// immune to certificate re-issuance (a re-signed cert carries new serial/validity/ +/// signature bytes for the same key). +/// +/// Returns `Ok(None)` on platforms with no such binding; the `instance_id` then +/// keeps its previous seed-only derivation. Fails closed: if the platform is known +/// to provide a binding but it cannot be read, we error rather than silently fall +/// back to a duplication-prone id. +fn platform_instance_binding() -> Result<Option<Vec<u8>>> { + use dstack_types::Platform; + match Platform::detect() { + Some(Platform::Gcp) => { + // Prefer the ECC AK, fall back to RSA (matches the quote path). 
+ let ak = match tpm::load_gcp_ak_ecc(None) { + Ok(ak) => ak, + Err(ecc_err) => tpm::load_gcp_ak_rsa(None).with_context(|| { + format!("failed to load gcp vTPM AK (ecc error: {ecc_err:#})") + })?, + }; + if ak.pub_area.is_empty() { + bail!("gcp vTPM AK public area is empty"); + } + Ok(Some(sha256(&ak.pub_area).to_vec())) + } + _ => Ok(None), + } +} + fn emit_key_provider_info(provider_info: &KeyProviderInfo) -> Result<()> { info!("Key provider info: {provider_info:?}"); let provider_info_json = serde_json::to_vec(&provider_info)?; @@ -1361,6 +1401,10 @@ impl<'a> Stage0<'a> { } else { let mut id_path = instance_info.instance_id_seed.clone(); id_path.extend_from_slice(&instance_info.app_id); + if let Some(binding) = platform_instance_binding()? { + info!("mixing platform per-instance binding into instance_id"); + id_path.extend_from_slice(&binding); + } sha256(&id_path)[..20].to_vec() }; instance_info.instance_id = instance_id.clone(); diff --git a/tpm-attest/src/gcp_ak.rs b/tpm-attest/src/gcp_ak.rs index 56a703135..2e7766efc 100644 --- a/tpm-attest/src/gcp_ak.rs +++ b/tpm-attest/src/gcp_ak.rs @@ -32,6 +32,13 @@ pub struct LoadedAk { pub context: TpmContext, pub handle: u32, pub cert_nv_index: u32, + /// Marshaled TPMT_PUBLIC public area of the AK. + /// + /// Derived deterministically from the per-instance Endorsement seed, so it is + /// stable across reboot/stop-start but fresh on a disk clone. Unlike the AK + /// certificate it carries no serial/validity/signature, so it is immune to + /// certificate re-issuance. 
+ pub pub_area: Vec<u8>, } /// Load GCP pre-provisioned ECC AK @@ -56,7 +63,7 @@ pub fn load_gcp_ak_ecc(tcti_path: Option<&str>) -> Result<LoadedAk> { ); // Create primary key under Endorsement hierarchy - let (handle, _public) = + let (handle, pub_area) = context.create_primary_from_template(tpm_rh::ENDORSEMENT, &template_bytes)?; debug!( @@ -68,6 +75,7 @@ pub fn load_gcp_ak_ecc(tcti_path: Option<&str>) -> Result<LoadedAk> { context, handle, cert_nv_index: gcp_nv_index::AK_ECC_CERT, + pub_area, }) } @@ -93,7 +101,7 @@ pub fn load_gcp_ak_rsa(tcti_path: Option<&str>) -> Result<LoadedAk> { ); // Create primary key under Endorsement hierarchy - let (handle, _public) = + let (handle, pub_area) = context.create_primary_from_template(tpm_rh::ENDORSEMENT, &template_bytes)?; debug!( @@ -105,6 +113,7 @@ pub fn load_gcp_ak_rsa(tcti_path: Option<&str>) -> Result<LoadedAk> { context, handle, cert_nv_index: gcp_nv_index::AK_RSA_CERT, + pub_area, }) }