From 49b6d9b9ecc32b9da2ea07e04353716496f71790 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:39:55 -0700 Subject: [PATCH 01/42] engineering: Add AZL4 distro detection and extend GRUB update path Implements AzureLinuxRelease::AzL4 variant, VERSION_ID 4.x parsing, ID_LIKE=fedora matching, updated GRUB match arms for AzL3|AzL4, and image_distro() fallback to host os-release. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/mkinitrd.rs | 2 + crates/osutils/src/osrelease.rs | 47 +++++++++++++++++++++++ crates/osutils/src/testutils/osrelease.rs | 25 ++++++++++++ crates/trident/src/engine/boot/grub.rs | 9 +++-- crates/trident/src/engine/context/mod.rs | 10 ++++- 5 files changed, 88 insertions(+), 5 deletions(-) diff --git a/crates/osutils/src/mkinitrd.rs b/crates/osutils/src/mkinitrd.rs index c6ab3d2e10..d01831826f 100644 --- a/crates/osutils/src/mkinitrd.rs +++ b/crates/osutils/src/mkinitrd.rs @@ -118,6 +118,8 @@ mod functional_test { fn test_regenerate_initrd() { let pattern = if osrelease::is_azl3().unwrap() { "/boot/initramfs-*.azl3.img" + } else if osrelease::is_azl4().unwrap() { + "/boot/initramfs-*.azl4.img" } else { "/boot/initrd.img-*" }; diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index e51926e745..c39981c6f7 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -31,6 +31,11 @@ pub fn is_azl3() -> Result { Ok(OsRelease::read()?.get_distro().is_azl3()) } +/// Returns whether the host is running Azure Linux 4. +pub fn is_azl4() -> Result { + Ok(OsRelease::read()?.get_distro().is_azl4()) +} + /// Represents the contents of the /etc/os-release file. 
/// /// See @@ -146,6 +151,8 @@ impl OsRelease { AzureLinuxRelease::AzL2 } else if v.starts_with("3.") { AzureLinuxRelease::AzL3 + } else if v.starts_with("4.") { + AzureLinuxRelease::AzL4 } else { trace!("Unknown Azure Linux release: {v}"); AzureLinuxRelease::Other @@ -342,6 +349,10 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL3) } + pub fn is_azl4(&self) -> bool { + self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) + } + pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } @@ -354,6 +365,7 @@ pub enum AzureLinuxRelease { Other, AzL2, AzL3, + AzL4, } #[cfg(test)] @@ -429,6 +441,41 @@ mod tests { ); } + #[test] + fn test_parse_azl4() { + let data = indoc::indoc! { + r#" + NAME="Azure Linux" + VERSION="4.0 (Four Alpha2)" + RELEASE_TYPE=development + ID=azurelinux + ID_LIKE=fedora + VERSION_ID="4.0" + VERSION_CODENAME="" + PRETTY_NAME="Azure Linux 4.0 (Four Alpha2)" + ANSI_COLOR="0;38;2;60;110;180" + LOGO=azurelinux-logo-icon + CPE_NAME="cpe:/o:azurelinuxproject:azurelinux:4.0" + DEFAULT_HOSTNAME="azurelinux" + HOME_URL="https://aka.ms/azurelinux" + DOCUMENTATION_URL="https://aka.ms/azurelinux" + SUPPORT_URL="https://aka.ms/azurelinux" + BUG_REPORT_URL="https://aka.ms/azurelinux" + SUPPORT_END=2026-05-15 + "#, + }; + + let os_release = OsRelease::parse(data); + assert_eq!(os_release.id, Some("azurelinux".to_string())); + assert_eq!(os_release.version_id, Some("4.0".to_string())); + assert_eq!(os_release.id_like, Some("fedora".to_string())); + assert_eq!(os_release.release_type, Some("development".to_string())); + assert_eq!( + os_release.get_distro(), + Distro::AzureLinux(AzureLinuxRelease::AzL4) + ); + } + #[test] fn test_parse_extension_release() { let data = indoc::indoc! 
{ diff --git a/crates/osutils/src/testutils/osrelease.rs b/crates/osutils/src/testutils/osrelease.rs index 6feff02bc6..27a2e5b173 100644 --- a/crates/osutils/src/testutils/osrelease.rs +++ b/crates/osutils/src/testutils/osrelease.rs @@ -38,11 +38,36 @@ const AZURE_LINUX_3_OS_RELEASE: &str = indoc::indoc! { "#, }; +/// Azure Linux 4.0 sample os-release file. +const AZURE_LINUX_4_OS_RELEASE: &str = indoc::indoc! { + r#" + NAME="Azure Linux" + VERSION="4.0 (Cloud Variant Beta)" + RELEASE_TYPE=development + ID=azurelinux + ID_LIKE=fedora + VERSION_ID="4.0" + VERSION_CODENAME="" + PRETTY_NAME="Azure Linux 4.0 (Cloud Variant Beta)" + ANSI_COLOR="0;38;2;60;110;180" + LOGO=azurelinux-logo-icon + CPE_NAME="cpe:/o:azurelinuxproject:azurelinux:4.0" + DEFAULT_HOSTNAME="azurelinux" + HOME_URL="https://aka.ms/azurelinux" + DOCUMENTATION_URL="https://aka.ms/azurelinux" + SUPPORT_URL="https://aka.ms/azurelinux" + BUG_REPORT_URL="https://aka.ms/azurelinux" + VARIANT="Cloud Variant" + VARIANT_ID=cloud + "#, +}; + /// Creates a mock /etc/os-release file with the given Azure Linux release. 
pub fn make_mock_os_release(root_path: &Path, azl_release: AzureLinuxRelease) -> Result<(), Error> { let os_release_content = match azl_release { AzureLinuxRelease::AzL2 => AZURE_LINUX_2_OS_RELEASE, AzureLinuxRelease::AzL3 => AZURE_LINUX_3_OS_RELEASE, + AzureLinuxRelease::AzL4 => AZURE_LINUX_4_OS_RELEASE, AzureLinuxRelease::Other => bail!("Unsupported Azure Linux release 'other'"), }; diff --git a/crates/trident/src/engine/boot/grub.rs b/crates/trident/src/engine/boot/grub.rs index b345f5c315..fb25b59c8f 100644 --- a/crates/trident/src/engine/boot/grub.rs +++ b/crates/trident/src/engine/boot/grub.rs @@ -63,9 +63,10 @@ pub(super) fn update_configs(ctx: &EngineContext) -> Result<(), Error> { let boot_grub_config_path = Path::new(ROOT_MOUNT_POINT_PATH).join(GRUB2_CONFIG_RELATIVE_PATH); // Update GRUB config on the boot device (volume holding /boot) - match ctx.host_os_release.get_distro() { - Distro::AzureLinux(AzureLinuxRelease::AzL3) => { - update_grub_config_azl3(ctx, &root_device_path, &boot_grub_config_path)?; + // Use the *image* distro (the OS being installed), not the host (MOS ISO). + match ctx.image_distro() { + Distro::AzureLinux(AzureLinuxRelease::AzL3 | AzureLinuxRelease::AzL4) => { + update_grub_config(ctx, &root_device_path, &boot_grub_config_path)?; } d => bail!("Unsupported distro for GRUB config update: {d:?}"), @@ -86,7 +87,7 @@ pub(super) fn update_configs(ctx: &EngineContext) -> Result<(), Error> { } /// Updates the GRUB config for Azure Linux 3.0 using OS modifier. -fn update_grub_config_azl3( +fn update_grub_config( ctx: &EngineContext, root_device_path: &Path, boot_grub_config_path: &Path, diff --git a/crates/trident/src/engine/context/mod.rs b/crates/trident/src/engine/context/mod.rs index 73fe61f4e3..f873ff9478 100644 --- a/crates/trident/src/engine/context/mod.rs +++ b/crates/trident/src/engine/context/mod.rs @@ -441,8 +441,16 @@ impl EngineContext { } /// Retrieves the distribution of the OS image. 
+ /// + /// Prefers the image's own os-release (e.g., from the COSI being installed). + /// Falls back to the host os-release when no image is available (functional + /// tests, runtime operations outside an install flow). pub(crate) fn image_distro(&self) -> Distro { - self.image_os_release().get_distro() + let distro = self.image_os_release().get_distro(); + match distro { + Distro::Other => self.host_os_release.get_distro(), + d => d, + } } } From cca2a4c31ace91190e67af498178ede67b16a655 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:41:07 -0700 Subject: [PATCH 02/42] fix: Only fall back to host distro when no image is mounted image_distro() was falling back to the host os-release whenever the image's distro was Distro::Other. This silently masked unrecognized distros as the host distro, causing GRUB config to be written for the wrong OS. Now: if an image is mounted (self.image.is_some()), always use the image's distro. Fallback to host only fires when no image is present at all (functional tests, runtime operations). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/engine/context/mod.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/trident/src/engine/context/mod.rs b/crates/trident/src/engine/context/mod.rs index f873ff9478..4632acabc0 100644 --- a/crates/trident/src/engine/context/mod.rs +++ b/crates/trident/src/engine/context/mod.rs @@ -443,13 +443,17 @@ impl EngineContext { /// Retrieves the distribution of the OS image. /// /// Prefers the image's own os-release (e.g., from the COSI being installed). - /// Falls back to the host os-release when no image is available (functional - /// tests, runtime operations outside an install flow). + /// Falls back to the host os-release only when no image is mounted + /// (functional tests, runtime operations outside an install flow). 
+ /// + /// If an image IS present but its distro is unrecognized, the image's + /// distro is returned as-is (Distro::Other) so callers can bail + /// explicitly rather than silently using the host's distro. pub(crate) fn image_distro(&self) -> Distro { - let distro = self.image_os_release().get_distro(); - match distro { - Distro::Other => self.host_os_release.get_distro(), - d => d, + if self.image.is_some() { + self.image_os_release().get_distro() + } else { + self.host_os_release.get_distro() } } } From 460393ca1b3809e20b9b06b0f18464c2fbeb672a Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:04 -0700 Subject: [PATCH 03/42] engineering: Generic EFI vendor-dir discovery and AZL4 ESP support Adds is_azl4_or_later() helper, generic EFI vendor-dir discovery via grub-probe, and AZL4 ESP partition layout support. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 109 +++++++++++++++- crates/osutils/src/osrelease.rs | 31 +++++ crates/trident/src/subsystems/esp.rs | 178 ++++++++++++++++++++++++--- 3 files changed, 298 insertions(+), 20 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index 92782bbf78..dea58f2dd4 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -231,22 +231,51 @@ impl GrubConfig { } /// Update the search command in the GRUB config. + /// + /// Three variants of the GRUB stub `search` line exist in practice: + /// + /// 1. The upstream legacy form: `search -n -u -s` + /// 2. AZL3 / standard form: `search --no-floppy --fs-uuid --set=root ` + /// 3. AZL4 MIC-generated form: `search --fs-uuid --set=root ` + /// (the `--no-floppy` option is redundant on EFI machines, so AZL4's + /// grub stub omits it.) + /// + /// We rewrite *every* matching line with the corresponding form so that + /// stubs containing more than one variant (rare but possible during + /// distribution transitions) all get the new UUID. 
We bail only if no + /// regex matched any line. pub fn update_search(&mut self, uuid: &Uuid) -> Result<(), Error> { let re = Regex::new(r"(?m)^(\s*)search -n -u [\w-]+ -s$").unwrap(); let re2 = Regex::new(r"(?m)^(\s*)search --no-floppy --fs-uuid --set=root [\w-]+$").unwrap(); + let re3 = Regex::new(r"(?m)^(\s*)search --fs-uuid --set=root [\w-]+$").unwrap(); + let mut matched = false; if re.is_match(&self.contents) { self.contents = re - .replace(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) + .replace_all(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) .to_string(); - } else if re2.is_match(&self.contents) { + matched = true; + } + if re2.is_match(&self.contents) { self.contents = re2 - .replace( + .replace_all( &self.contents, &format!("${{1}}search --no-floppy --fs-uuid --set=root {uuid}"), ) .to_string(); - } else { + matched = true; + } + if re3.is_match(&self.contents) { + self.contents = re3 + .replace_all( + &self.contents, + &format!("${{1}}search --fs-uuid --set=root {uuid}"), + ) + .to_string(); + matched = true; + } + + if !matched { bail!( "Unable to find search command in '{}'", &self.path.display() @@ -953,6 +982,78 @@ mod tests { .unwrap(); } + #[test] + fn test_update_search_azl3_form() { + // AZL3 stubs use `search --no-floppy --fs-uuid --set=root `. + let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + set timeout=0 + search --no-floppy --fs-uuid --set=root deadbeef-cafe-babe-0000-111122223333 + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(grub_config.contents.contains(&format!( + "search --no-floppy --fs-uuid --set=root {new_uuid}" + ))); + assert!(!grub_config.contents.contains("deadbeef")); + } + + #[test] + fn test_update_search_azl4_form() { + // AZL4 MIC-generated stubs omit --no-floppy. 
+ let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + set timeout=0 + search --fs-uuid --set=root deadbeef-cafe-babe-0000-111122223333 + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(grub_config + .contents + .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); + assert!(!grub_config.contents.contains("deadbeef")); + // Must not accidentally insert --no-floppy. + assert!(!grub_config.contents.contains("--no-floppy")); + } + + #[test] + fn test_update_search_mixed_forms() { + // If both AZL3 and AZL4 forms appear (e.g. an image whose stub + // includes vendored fragments), both must be rewritten. + let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + search --no-floppy --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc + search --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(!grub_config.contents.contains("oldoldold")); + assert!(grub_config.contents.contains(&format!( + "search --no-floppy --fs-uuid --set=root {new_uuid}" + ))); + assert!(grub_config + .contents + .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); + } + #[test] fn test_update_rootdevice() { // Define original GRUB config contents on target machine diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index c39981c6f7..5d8caafe29 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -353,6 +353,37 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) } + /// Returns true for AZL4 and any later Azure Linux release. 
+ /// + /// Use this when gating behavior on features that landed in AZL4 and + /// are expected to remain present in subsequent major releases (e.g. + /// AZL4 dropped the `grub2-efi-binary-noprefix` packaging convention; + /// AZL5+ is expected to keep that change). Strict `is_azl4()` would + /// silently regress to the AZL3 code path when AZL5 ships. + /// + /// The decision is based on the `AzureLinuxRelease` ordering AND, for + /// versions newer than what the parser recognizes, the numeric major + /// component of `version_id`. New major releases that the parser + /// hasn't been taught yet will fall through to `AzureLinuxRelease::Other`, + /// so we re-check `version_id` directly. + pub fn is_azl4_or_later(&self, version_id: Option<&str>) -> bool { + if let Distro::AzureLinux(rel) = self { + if matches!(rel, AzureLinuxRelease::AzL4) { + return true; + } + // Parser doesn't know this version yet; inspect version_id. + if matches!(rel, AzureLinuxRelease::Other) { + if let Some(major) = version_id + .and_then(|v| v.split('.').next()) + .and_then(|m| m.parse::().ok()) + { + return major >= 4; + } + } + } + false + } + pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index e3073aa8b6..b7d16dc3ce 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -6,7 +6,7 @@ use std::{ }; use anyhow::{bail, ensure, Context, Error}; -use log::{debug, trace}; +use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -292,8 +292,24 @@ fn copy_file_artifacts( uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; } else { // In non-UKI mode, bail if grub_noprefix.efi is not found in the image. + // AZL4+ does not ship grub2-efi-binary-noprefix (AZL3-specific convention), + // so automatically skip this check for AZL4 and later. 
`is_azl4_or_later` + // handles AZL5+ correctly by re-checking version_id when the parser + // falls back to AzureLinuxRelease::Other. + // TODO: Two sources of truth for "noprefix not required" exist now: + // - this distro check + // - the filesystem probe in generate_boot_filepaths + // The probe is authoritative. Consider folding the check into the + // probe result (e.g. ensure! that *some* grub binary was found, + // not specifically the noprefix variant) in a follow-up. See + // 2026-05-18 PR-2 deep-review.md. + let image_os_release = ctx.image_os_release(); + let is_azl4_or_later = image_os_release + .get_distro() + .is_azl4_or_later(image_os_release.version_id.as_deref()); ensure!( grub_noprefix + || is_azl4_or_later || ctx .spec .internal_params @@ -605,6 +621,69 @@ fn copy_boot_files( Ok(no_prefix) } +/// Search EFI vendor directories for a specific binary. +/// +/// UEFI convention: each OS vendor installs its bootloader under +/// `EFI//` (e.g., `EFI/fedora/`, `EFI/azurelinux/`). +/// This function searches all subdirectories of the EFI directory +/// for the specified binary, skipping the BOOT fallback directory. +/// +/// Vendor dirs are iterated in sorted (lexicographic) order so the +/// selection is reproducible across builds when more than one vendor +/// directory contains a candidate. `read_dir` order alone is +/// filesystem-dependent (ext4 returns hash order, FAT returns +/// directory-entry order), which would produce irreproducible ESP +/// images on cross-builds and break attestation/PCR lock for the +/// selected bootloader. +fn find_efi_binary_in_vendor_dirs(efi_dir: &Path, binary_name: &str) -> Option { + let entries = match std::fs::read_dir(efi_dir) { + Ok(e) => e, + Err(e) => { + debug!("Cannot read EFI directory '{}': {}", efi_dir.display(), e); + return None; + } + }; + + // Materialize entries first so we can sort, and so a per-entry + // iterator error is logged instead of silently dropped. 
+ let mut paths: Vec = Vec::new(); + for entry in entries { + match entry { + Ok(e) => paths.push(e.path()), + Err(e) => warn!( + "Failed to read entry under EFI directory '{}': {}", + efi_dir.display(), + e + ), + } + } + paths.sort(); + + for path in paths { + if !path.is_dir() { + continue; + } + + // Skip the BOOT directory (already checked by the caller) + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.eq_ignore_ascii_case("BOOT") { + continue; + } + } + + let candidate = path.join(binary_name); + if candidate.exists() && candidate.is_file() { + debug!( + "Found GRUB EFI executable in vendor directory: '{}'", + candidate.display() + ); + return Some(candidate); + } + } + + None +} + /// Generates a list of filepaths to the boot files that need to be copied to implement file-based /// update of ESP, relative to the mounted directory. /// @@ -642,24 +721,35 @@ fn generate_boot_filepaths(temp_mount_dir: &Path, is_uki: bool) -> Result Date: Wed, 3 Jun 2026 15:14:01 -0700 Subject: [PATCH 04/42] engineering: Clean up ESP noprefix check and grub search comments - Remove redundant ensure!(grub_noprefix) check from ESP setup. generate_boot_filepaths() already finds a working GRUB binary (noprefix, standard, or vendor-dir). The separate policy check was redundant. 
- Simplify copy_boot_files to return () instead of bool - Attribute grub search format variants to distro conventions (AZL3/Mariner vs AZL4/Fedora), not MIC internals - Update mixed-forms test comment to reference cross-version A/B update scenario Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 14 ++++--- crates/trident/src/subsystems/esp.rs | 58 ++++++---------------------- 2 files changed, 20 insertions(+), 52 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index dea58f2dd4..c97183616c 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -236,9 +236,9 @@ impl GrubConfig { /// /// 1. The upstream legacy form: `search -n -u -s` /// 2. AZL3 / standard form: `search --no-floppy --fs-uuid --set=root ` - /// 3. AZL4 MIC-generated form: `search --fs-uuid --set=root ` - /// (the `--no-floppy` option is redundant on EFI machines, so AZL4's - /// grub stub omits it.) + /// 3. AZL4 / Fedora-based form: `search --fs-uuid --set=root ` + /// (`--no-floppy` is a Mariner-specific convention; Fedora's grub2 + /// scripts don't emit it, and it's redundant on EFI machines.) /// /// We rewrite *every* matching line with the corresponding form so that /// stubs containing more than one variant (rare but possible during @@ -1006,7 +1006,7 @@ mod tests { #[test] fn test_update_search_azl4_form() { - // AZL4 MIC-generated stubs omit --no-floppy. + // AZL4 (Fedora-based) stubs omit --no-floppy. let mut grub_config = GrubConfig { path: PathBuf::new(), contents: indoc::indoc! { r#" @@ -1030,8 +1030,10 @@ mod tests { #[test] fn test_update_search_mixed_forms() { - // If both AZL3 and AZL4 forms appear (e.g. an image whose stub - // includes vendored fragments), both must be rewritten. + // Validates that all three regex paths fire independently. While a + // single grub stub typically contains one search form, cross-version + // A/B updates (e.g. 
AZL3->AZL4) may leave different formats across + // the boot and ESP grub configs over the machine's lifecycle. let mut grub_config = GrubConfig { path: PathBuf::new(), contents: indoc::indoc! { r#" diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index b7d16dc3ce..1c81d4187c 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -5,7 +5,7 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{bail, ensure, Context, Error}; +use anyhow::{bail, Context, Error}; use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -19,7 +19,7 @@ use osutils::{ use trident_api::{ config::UefiFallbackMode, constants::{ - internal_params::{DISABLE_GRUB_NOPREFIX_CHECK, RAW_COSI_STORAGE}, + internal_params::RAW_COSI_STORAGE, EFI_DEFAULT_BIN_DIRECTORY, EFI_DEFAULT_BIN_RELATIVE_PATH, ESP_EFI_DIRECTORY, GRUB2_CONFIG_FILENAME, GRUB2_CONFIG_RELATIVE_PATH, }, @@ -277,12 +277,11 @@ fn copy_file_artifacts( } // Call helper func to copy boot files from temp_mount_dir to esp_dir_path - let grub_noprefix = - copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( - "Failed to copy boot files from directory {} to directory {}", - temp_mount_dir.display(), - esp_dir_path.display() - ))?; + copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( + "Failed to copy boot files from directory {} to directory {}", + temp_mount_dir.display(), + esp_dir_path.display() + ))?; if ctx.is_uki().unstructured("UKI setting unknown")? { // Prepare ESP directory structure for UKI boot @@ -291,32 +290,8 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; } else { - // In non-UKI mode, bail if grub_noprefix.efi is not found in the image. 
- // AZL4+ does not ship grub2-efi-binary-noprefix (AZL3-specific convention), - // so automatically skip this check for AZL4 and later. `is_azl4_or_later` - // handles AZL5+ correctly by re-checking version_id when the parser - // falls back to AzureLinuxRelease::Other. - // TODO: Two sources of truth for "noprefix not required" exist now: - // - this distro check - // - the filesystem probe in generate_boot_filepaths - // The probe is authoritative. Consider folding the check into the - // probe result (e.g. ensure! that *some* grub binary was found, - // not specifically the noprefix variant) in a follow-up. See - // 2026-05-18 PR-2 deep-review.md. - let image_os_release = ctx.image_os_release(); - let is_azl4_or_later = image_os_release - .get_distro() - .is_azl4_or_later(image_os_release.version_id.as_deref()); - ensure!( - grub_noprefix - || is_azl4_or_later - || ctx - .spec - .internal_params - .get_flag(DISABLE_GRUB_NOPREFIX_CHECK), - "Cannot locate {GRUB_NOPREFIX_EFI} in the boot image. \ - Verify if the grub2-efi-binary-noprefix package was installed on the booted image.", - ); + // generate_boot_filepaths already found a working GRUB binary + // (noprefix, standard, or vendor-dir). No further check needed. } Ok(()) @@ -573,9 +548,7 @@ fn copy_boot_files( temp_mount_dir: &Path, esp_dir: &Path, boot_files: Vec, -) -> Result { - // Track whether grub-noprefix.efi is used - let mut no_prefix = false; +) -> Result<(), Error> { // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { let source_path = temp_mount_dir.join(boot_file); @@ -614,11 +587,10 @@ fn copy_boot_files( .context("Failed to convert path to string")?, ) .context("Failed to rename grub-noprefix efi")?; - no_prefix = true; } } - Ok(no_prefix) + Ok(()) } /// Search EFI vendor directories for a specific binary. 
@@ -1406,13 +1378,7 @@ mod tests { // Call helper func to create mock boot files in temp_mount_dir create_boot_files(temp_mount_dir.path(), &file_names, "test-content"); // Call helper func to copy boot files from temp_mount_dir to esp_dir - let noprefix = - copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); - - assert!( - noprefix, - "grub-noprefix.efi is in the list of files, so it should be detected" - ); + copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); for file_name in file_names.clone() { // Create full path of source_path From bb2fd89905638529632c44c39c6157073252113c Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 15:17:21 -0700 Subject: [PATCH 05/42] engineering: Remove unused is_azl4_or_later helper No callers remain after the noprefix check removal. Can be re-added if a future change needs version-range gating. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/osrelease.rs | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index 5d8caafe29..c39981c6f7 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -353,37 +353,6 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) } - /// Returns true for AZL4 and any later Azure Linux release. - /// - /// Use this when gating behavior on features that landed in AZL4 and - /// are expected to remain present in subsequent major releases (e.g. - /// AZL4 dropped the `grub2-efi-binary-noprefix` packaging convention; - /// AZL5+ is expected to keep that change). Strict `is_azl4()` would - /// silently regress to the AZL3 code path when AZL5 ships. - /// - /// The decision is based on the `AzureLinuxRelease` ordering AND, for - /// versions newer than what the parser recognizes, the numeric major - /// component of `version_id`. 
New major releases that the parser - /// hasn't been taught yet will fall through to `AzureLinuxRelease::Other`, - /// so we re-check `version_id` directly. - pub fn is_azl4_or_later(&self, version_id: Option<&str>) -> bool { - if let Distro::AzureLinux(rel) = self { - if matches!(rel, AzureLinuxRelease::AzL4) { - return true; - } - // Parser doesn't know this version yet; inspect version_id. - if matches!(rel, AzureLinuxRelease::Other) { - if let Some(major) = version_id - .and_then(|v| v.split('.').next()) - .and_then(|m| m.parse::().ok()) - { - return major >= 4; - } - } - } - false - } - pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } From 2411dd9f644c95fc8686e0094d20d2f1ae7dd90f Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:16:00 -0700 Subject: [PATCH 06/42] engineering: Restore AZL3 noprefix guard as distro-specific check AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative config lookup) and grub2-efi-binary-noprefix (root-device-relative lookup). Trident's A/B update path requires the noprefix variant on AZL3. Restore the noprefix check, but scope it to AZL3 only using image_distro().is_azl3(). AZL4+ uses standard grubx64.efi in vendor directories and does not need noprefix. This replaces the previous generic ensure! + DISABLE_GRUB_NOPREFIX_CHECK flag with a targeted distro check. No escape hatch needed since the check only fires for AZL3. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 38 ++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index 1c81d4187c..e0d7afc0a0 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -277,11 +277,12 @@ fn copy_file_artifacts( } // Call helper func to copy boot files from temp_mount_dir to esp_dir_path - copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( - "Failed to copy boot files from directory {} to directory {}", - temp_mount_dir.display(), - esp_dir_path.display() - ))?; + let used_noprefix = + copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( + "Failed to copy boot files from directory {} to directory {}", + temp_mount_dir.display(), + esp_dir_path.display() + ))?; if ctx.is_uki().unstructured("UKI setting unknown")? { // Prepare ESP directory structure for UKI boot @@ -289,9 +290,16 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else { - // generate_boot_filepaths already found a working GRUB binary - // (noprefix, standard, or vendor-dir). No further check needed. + } else if ctx.image_distro().is_azl3() && !used_noprefix { + // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative + // config lookup) and grub2-efi-binary-noprefix (root-device-relative + // config lookup). Trident's A/B update path requires the noprefix + // variant. If the image shipped the wrong one, fail early rather + // than producing an unbootable machine. + bail!( + "AZL3 image does not contain {GRUB_NOPREFIX_EFI}. \ + Trident requires the grub2-efi-binary-noprefix package on AZL3." 
+ ); } Ok(()) @@ -548,7 +556,8 @@ fn copy_boot_files( temp_mount_dir: &Path, esp_dir: &Path, boot_files: Vec, -) -> Result<(), Error> { +) -> Result { + let mut used_noprefix = false; // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { let source_path = temp_mount_dir.join(boot_file); @@ -587,10 +596,11 @@ fn copy_boot_files( .context("Failed to convert path to string")?, ) .context("Failed to rename grub-noprefix efi")?; + used_noprefix = true; } } - Ok(()) + Ok(used_noprefix) } /// Search EFI vendor directories for a specific binary. @@ -1378,7 +1388,13 @@ mod tests { // Call helper func to create mock boot files in temp_mount_dir create_boot_files(temp_mount_dir.path(), &file_names, "test-content"); // Call helper func to copy boot files from temp_mount_dir to esp_dir - copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); + let used_noprefix = + copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); + + assert!( + used_noprefix, + "grub-noprefix.efi is in the list of files, so it should be detected" + ); for file_name in file_names.clone() { // Create full path of source_path From d5846c21aa7632df10560da32a9e07ba36212a34 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:22:31 -0700 Subject: [PATCH 07/42] fix: Restore grub_noprefix name and DISABLE_GRUB_NOPREFIX_CHECK flag Keep the original variable name and preserve the operator escape hatch. Minimize diff from upstream. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index e0d7afc0a0..8bd900bb11 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -19,7 +19,7 @@ use osutils::{ use trident_api::{ config::UefiFallbackMode, constants::{ - internal_params::RAW_COSI_STORAGE, + internal_params::{DISABLE_GRUB_NOPREFIX_CHECK, RAW_COSI_STORAGE}, EFI_DEFAULT_BIN_DIRECTORY, EFI_DEFAULT_BIN_RELATIVE_PATH, ESP_EFI_DIRECTORY, GRUB2_CONFIG_FILENAME, GRUB2_CONFIG_RELATIVE_PATH, }, @@ -277,7 +277,7 @@ fn copy_file_artifacts( } // Call helper func to copy boot files from temp_mount_dir to esp_dir_path - let used_noprefix = + let grub_noprefix = copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( "Failed to copy boot files from directory {} to directory {}", temp_mount_dir.display(), @@ -290,7 +290,10 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else if ctx.image_distro().is_azl3() && !used_noprefix { + } else if ctx.image_distro().is_azl3() + && !grub_noprefix + && !ctx.spec.internal_params.get_flag(DISABLE_GRUB_NOPREFIX_CHECK) + { // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative // config lookup) and grub2-efi-binary-noprefix (root-device-relative // config lookup). 
Trident's A/B update path requires the noprefix @@ -557,7 +560,7 @@ fn copy_boot_files( esp_dir: &Path, boot_files: Vec, ) -> Result { - let mut used_noprefix = false; + let mut no_prefix = false; // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { let source_path = temp_mount_dir.join(boot_file); @@ -596,11 +599,11 @@ fn copy_boot_files( .context("Failed to convert path to string")?, ) .context("Failed to rename grub-noprefix efi")?; - used_noprefix = true; + no_prefix = true; } } - Ok(used_noprefix) + Ok(no_prefix) } /// Search EFI vendor directories for a specific binary. From 5ad0c6a3dc97fb9db1b557183bd973d169ee0377 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:46:25 -0700 Subject: [PATCH 08/42] fix: Use ensure! instead of bail for noprefix check Keep the same macro as upstream to minimize diff. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index 8bd900bb11..ae90c8512b 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -5,7 +5,7 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{bail, Context, Error}; +use anyhow::{bail, ensure, Context, Error}; use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -290,18 +290,20 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else if ctx.image_distro().is_azl3() - && !grub_noprefix - && !ctx.spec.internal_params.get_flag(DISABLE_GRUB_NOPREFIX_CHECK) - { + } else if ctx.image_distro().is_azl3() { // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative // config lookup) and grub2-efi-binary-noprefix (root-device-relative // config 
lookup). Trident's A/B update path requires the noprefix // variant. If the image shipped the wrong one, fail early rather // than producing an unbootable machine. - bail!( - "AZL3 image does not contain {GRUB_NOPREFIX_EFI}. \ - Trident requires the grub2-efi-binary-noprefix package on AZL3." + ensure!( + grub_noprefix + || ctx + .spec + .internal_params + .get_flag(DISABLE_GRUB_NOPREFIX_CHECK), + "Cannot locate {GRUB_NOPREFIX_EFI} in the boot image. \ + Verify if the grub2-efi-binary-noprefix package was installed on the booted image.", ); } From 74ead34bc49c17544726c0982e3c845c46950fee Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:48:57 -0700 Subject: [PATCH 09/42] fix: Revert replace_all back to replace in update_search Keep the original if/else if chain with replace (first match). No real-world grub config has multiple search lines. Minimizes diff from upstream. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index c97183616c..f55476b82e 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -239,43 +239,30 @@ impl GrubConfig { /// 3. AZL4 / Fedora-based form: `search --fs-uuid --set=root ` /// (`--no-floppy` is a Mariner-specific convention; Fedora's grub2 /// scripts don't emit it, and it's redundant on EFI machines.) - /// - /// We rewrite *every* matching line with the corresponding form so that - /// stubs containing more than one variant (rare but possible during - /// distribution transitions) all get the new UUID. We bail only if no - /// regex matched any line. 
pub fn update_search(&mut self, uuid: &Uuid) -> Result<(), Error> { let re = Regex::new(r"(?m)^(\s*)search -n -u [\w-]+ -s$").unwrap(); let re2 = Regex::new(r"(?m)^(\s*)search --no-floppy --fs-uuid --set=root [\w-]+$").unwrap(); let re3 = Regex::new(r"(?m)^(\s*)search --fs-uuid --set=root [\w-]+$").unwrap(); - let mut matched = false; if re.is_match(&self.contents) { self.contents = re - .replace_all(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) + .replace(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) .to_string(); - matched = true; - } - if re2.is_match(&self.contents) { + } else if re2.is_match(&self.contents) { self.contents = re2 - .replace_all( + .replace( &self.contents, &format!("${{1}}search --no-floppy --fs-uuid --set=root {uuid}"), ) .to_string(); - matched = true; - } - if re3.is_match(&self.contents) { + } else if re3.is_match(&self.contents) { self.contents = re3 - .replace_all( + .replace( &self.contents, &format!("${{1}}search --fs-uuid --set=root {uuid}"), ) .to_string(); - matched = true; - } - - if !matched { + } else { bail!( "Unable to find search command in '{}'", &self.path.display() From ed333bf91e76ad1a8fc955ae3221a2e521b4bd4c Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:52:52 -0700 Subject: [PATCH 10/42] fix: Restore original test variable name noprefix Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index ae90c8512b..1ba98ea410 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -1393,11 +1393,11 @@ mod tests { // Call helper func to create mock boot files in temp_mount_dir create_boot_files(temp_mount_dir.path(), &file_names, "test-content"); // Call helper func to copy boot files from temp_mount_dir to esp_dir - let used_noprefix = + let 
noprefix = copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); assert!( - used_noprefix, + noprefix, "grub-noprefix.efi is in the list of files, so it should be detected" ); From 550ff11ba90a5876bdbf4443f983b1249df4f806 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 17:52:24 -0700 Subject: [PATCH 11/42] fix: Remove mixed-forms test incompatible with if/else if chain Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index f55476b82e..352064bee5 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -1015,34 +1015,6 @@ mod tests { assert!(!grub_config.contents.contains("--no-floppy")); } - #[test] - fn test_update_search_mixed_forms() { - // Validates that all three regex paths fire independently. While a - // single grub stub typically contains one search form, cross-version - // A/B updates (e.g. AZL3->AZL4) may leave different formats across - // the boot and ESP grub configs over the machine's lifecycle. - let mut grub_config = GrubConfig { - path: PathBuf::new(), - contents: indoc::indoc! 
{ r#" - search --no-floppy --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc - search --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc - "# } - .to_owned(), - linux_command_line: None, - }; - - let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); - grub_config.update_search(&new_uuid).unwrap(); - - assert!(!grub_config.contents.contains("oldoldold")); - assert!(grub_config.contents.contains(&format!( - "search --no-floppy --fs-uuid --set=root {new_uuid}" - ))); - assert!(grub_config - .contents - .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); - } - #[test] fn test_update_rootdevice() { // Define original GRUB config contents on target machine From afb7a2679878eb96590b7ad6ca2c826dc8ab22a5 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Fri, 5 Jun 2026 11:40:50 -0700 Subject: [PATCH 12/42] engineering: Add BLS entry support for grub boot arg extraction AZL4 (Fedora-based) uses Boot Loader Spec entries instead of inline linux commands in grub.cfg. When grub.cfg contains blscfg and no inline linux lines, fall back to reading boot args from /boot/loader/entries/*.conf. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osmodifier/src/grub_cfg.rs | 189 +++++++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 1 deletion(-) diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index ade45dca97..4cd48dfbd7 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -18,6 +18,10 @@ use crate::OsModifierContext; /// Possible grub.cfg locations, tried in order. const GRUB_CFG_PATHS: &[&str] = &["/boot/grub2/grub.cfg", "/boot/grub/grub.cfg"]; +/// BLS (Boot Loader Spec) entry directory. Fedora-based distros (including +/// AZL4) store kernel boot entries here instead of inline in grub.cfg. +const BLS_ENTRIES_DIR: &str = "/boot/loader/entries"; + /// Extract boot arguments from the generated grub.cfg. 
/// /// Returns a tuple of (args_to_sync, optional_root_device). @@ -37,7 +41,14 @@ pub fn extract_boot_args_from_grub_cfg( // Find the non-recovery linux command lines. // Go expects exactly one; error otherwise. - let linux_lines = find_non_recovery_linux_lines(&content)?; + let linux_lines = match find_non_recovery_linux_lines(&content) { + Ok(lines) => lines, + Err(_) if content.contains("blscfg") => { + debug!("grub.cfg uses BLS (blscfg); reading boot args from BLS entries"); + extract_options_from_bls_entries(ctx)? + } + Err(e) => return Err(e), + }; if linux_lines.len() != 1 { bail!( "expected 1 non-recovery linux line, found {}", @@ -94,6 +105,58 @@ fn find_grub_cfg(ctx: &OsModifierContext) -> Result { bail!("Could not find grub.cfg at any of: {:?}", GRUB_CFG_PATHS) } +/// Read boot arguments from BLS (Boot Loader Spec) entries. +/// +/// Scans `{root}/boot/loader/entries/*.conf`, skips entries whose title +/// contains "rescue" or "recovery" (case-insensitive), and returns the +/// `options` line from the first valid entry (sorted lexically, matching +/// grub's ordering). +fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result, Error> { + let entries_dir = ctx.path(BLS_ENTRIES_DIR); + let mut conf_files: Vec = fs::read_dir(&entries_dir) + .with_context(|| format!("Failed to read BLS entries dir '{}'", entries_dir.display()))? 
+ .filter_map(|e| e.ok()) + .map(|e| e.path()) + .filter(|p| p.extension().map_or(false, |ext| ext == "conf")) + .collect(); + + conf_files.sort(); + + for conf_path in &conf_files { + let content = fs::read_to_string(conf_path) + .with_context(|| format!("Failed to read BLS entry '{}'", conf_path.display()))?; + + let mut title = None; + let mut options = None; + + for line in content.lines() { + if let Some(value) = line.strip_prefix("title") { + title = Some(value.trim().to_string()); + } else if let Some(value) = line.strip_prefix("options") { + options = Some(value.trim().to_string()); + } + } + + // Skip recovery/rescue entries. + if let Some(ref t) = title { + let lower = t.to_lowercase(); + if lower.contains("rescue") || lower.contains("recovery") { + trace!("Skipping BLS rescue/recovery entry: {}", conf_path.display()); + continue; + } + } + + if let Some(opts) = options { + debug!("Using BLS entry '{}': options = {opts}", conf_path.display()); + // Return as a synthetic "linux" line: prepend a dummy kernel path + // so the downstream parser (which skips the first token) works. + return Ok(vec![format!("/boot/vmlinuz {opts}")]); + } + } + + bail!("no non-recovery BLS entry found in '{}'", entries_dir.display()) +} + /// Return the first whitespace-delimited word from a line, or None if the /// line is empty / whitespace-only. fn first_word(line: &str) -> Option<&str> { @@ -757,4 +820,128 @@ mod tests { assert_eq!(count_braces("menuentry 'title {x}' {"), (1, 0)); assert_eq!(count_braces(r#"menuentry "title {x}" {"#), (1, 0)); } + + // ======================= BLS entry support ======================= + + #[test] + fn test_extract_bls_fallback() { + let tmp = tempdir().unwrap(); + + // Write a BLS-style grub.cfg (contains blscfg, no inline linux lines) + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + indoc::indoc! 
{r#" + set timeout=5 + load_env -f /boot/grub2/grubenv + blscfg + "#}, + ) + .unwrap(); + + // Write a BLS entry + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + std::fs::write( + bls_dir.join("azl4.conf"), + indoc::indoc! {r#" + title Azure Linux 4.0 (6.6.60) + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro selinux=1 rd.overlayfs=lower,upper,work,/dev/sda5 + "#}, + ) + .unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let (args, root_device) = extract_boot_args_from_grub_cfg(&ctx).unwrap(); + assert_eq!(root_device, Some("/dev/sda2".to_string())); + assert!(args.contains(&"selinux=1".to_string())); + } + + #[test] + fn test_extract_bls_skips_recovery() { + let tmp = tempdir().unwrap(); + + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + "set timeout=5\nblscfg\n", + ) + .unwrap(); + + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + + // Rescue entry (should be skipped) + std::fs::write( + bls_dir.join("rescue.conf"), + indoc::indoc! {r#" + title Azure Linux 4.0 rescue + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro single + "#}, + ) + .unwrap(); + + // Normal entry (should be used) + std::fs::write( + bls_dir.join("zzz-normal.conf"), + indoc::indoc! 
{r#" + title Azure Linux 4.0 (6.6.60) + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro selinux=1 + "#}, + ) + .unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let (args, root_device) = extract_boot_args_from_grub_cfg(&ctx).unwrap(); + assert_eq!(root_device, Some("/dev/sda2".to_string())); + assert!(args.contains(&"selinux=1".to_string())); + // "single" from rescue entry should NOT appear + assert!(!args.iter().any(|a| a.contains("single"))); + } + + #[test] + fn test_extract_bls_no_entries() { + let tmp = tempdir().unwrap(); + + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + "set timeout=5\nblscfg\n", + ) + .unwrap(); + + // Empty BLS entries dir + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let result = extract_boot_args_from_grub_cfg(&ctx); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("no non-recovery BLS entry found"), + "Error should mention no BLS entries, got: {err_msg}" + ); + } } From 75f8095a471428ab2998ca7cb2b370498b5c0014 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Fri, 5 Jun 2026 12:37:00 -0700 Subject: [PATCH 13/42] fix: Apply rustfmt to BLS support code Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osmodifier/src/grub_cfg.rs | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index 4cd48dfbd7..19a9f3bd22 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -141,20 +141,29 @@ fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result Date: Tue, 2 Jun 2026 17:40:14 
-0700 Subject: [PATCH 14/42] infra: Add AZL4 builder infrastructure and image acquisition Adds AZL4 build pipeline stages with MCR-hosted MIC container, BlobImageManifest class for ACG blob source downloads, and service connection runbook. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitattributes | 22 ++ .gitignore | 5 +- .../stages/build_image/build-image-azl4.yml | 79 ++++++ .../build_image/build-image-template-azl4.yml | 165 +++++++++++++ tests/images/SERVICE-CONNECTION-RUNBOOK.md | 225 ++++++++++++++++++ tests/images/builder/__init__.py | 41 +++- tests/images/builder/cli.py | 22 +- tests/images/builder/download.py | 145 ++++++++++- tests/images/builder/run.py | 14 +- tests/images/testimages.py | 59 +++++ 10 files changed, 769 insertions(+), 8 deletions(-) create mode 100644 .gitattributes create mode 100644 .pipelines/templates/stages/build_image/build-image-azl4.yml create mode 100644 .pipelines/templates/stages/build_image/build-image-template-azl4.yml create mode 100644 tests/images/SERVICE-CONNECTION-RUNBOOK.md diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..0a680fcc46 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +* text=auto eol=lf + +# Anything that gets executed inside an image must keep LF endings; CRLF +# on shebang lines breaks the interpreter lookup with `bad interpreter: +# /bin/bash^M`. +*.sh text eol=lf +*.py text eol=lf +*.service text eol=lf +*.network text eol=lf +*.yaml text eol=lf +*.yml text eol=lf + +# Binary artifacts — never normalize. 
+*.vhdx binary +*.cosi binary +*.qcow2 binary +*.iso binary +*.raw binary +*.png binary +*.jpg binary +*.zst binary +*.patch text eol=lf diff --git a/.gitignore b/.gitignore index e7d3febb7d..a8fd852362 100644 --- a/.gitignore +++ b/.gitignore @@ -366,4 +366,7 @@ vendor/ # Virtdeploy files /tools/vm-netlaunch.yaml -/tools/virt-deploy-metadata.json \ No newline at end of file +/tools/virt-deploy-metadata.json +# AZL4 trident binary baked into test image (built locally) +tests/images/trident-vm-testimage/base/trident-bin/ +tests/images/trident-vm-testimage/base/osmodifier-bin/ diff --git a/.pipelines/templates/stages/build_image/build-image-azl4.yml b/.pipelines/templates/stages/build_image/build-image-azl4.yml new file mode 100644 index 0000000000..0fae10eb29 --- /dev/null +++ b/.pipelines/templates/stages/build_image/build-image-azl4.yml @@ -0,0 +1,79 @@ +# AZL4 variant of build-image.yml. +# +# Forked from build-image.yml on 2026-05-13. Calls build-image-template-azl4.yml +# (which uses MCR MIC container + blob-sourced base VHDX) instead of the +# external test-images repo template. +# +# TODO(azl4-merge-back): Merge this back into build-image.yml with an +# `azureLinuxVersion` parameter switch once AZL4 has feed-published base VHDXes +# and RPMs. + +parameters: + - name: imageName + type: string + + - name: clones + displayName: "Number of clones to generate" + type: number + default: 2 + + - name: dependsOnTrident + type: boolean + default: true + + - name: dependsOnStage + type: string + default: "" + +stages: + - stage: TridentTestImg_${{ replace(parameters.imageName, '-', '_') }} + displayName: Build ${{ parameters.imageName }} + ${{ if parameters.dependsOnTrident }}: + dependsOn: + # AZL4 doesn't have RPM publication so we depend on the + # trident-binaries artifact (which the GetTridentBinaries stage + # produces and copies to artifacts/binaries/trident). + - GetTridentBinaries_rpms_amd64 + # PrepareSSHKeys produces the shared 'ssh-keys' artifact. 
+ # build-image-template-azl4.yml stages it into the testimage + # tree so qcow2 + cosi builds share the same SSH keypair, + # which lets storm-trident SSH into both A/B sides after + # update. + - PrepareSSHKeys + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + ${{ elseif ne(parameters.dependsOnStage, '') }}: + dependsOn: + - PrepareSSHKeys + - ${{ parameters.dependsOnStage }} + + jobs: + - job: BuildTridentTestImgAzl4 + displayName: Build (AZL4 MIC) + # Pinned MIC container build adds ~5 min cold-cache. Bump the timeout + # accordingly. TODO(azl4-release): lower back to 20 min once we use a + # released MIC container. + timeoutInMinutes: 30 + pool: + type: linux + + variables: + ob_outputDirectory: /tmp/output + ob_artifactBaseName: ${{ parameters.imageName }} + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + inputs: + buildType: current + artifactName: trident-binaries + targetPath: "$(Build.ArtifactStagingDirectory)/trident-binaries" + displayName: Download Trident binaries + condition: eq('${{ parameters.dependsOnTrident }}', true) + + - template: build-image-template-azl4.yml + parameters: + tridentSourceDirectory: $(TRIDENT_SOURCE_DIR) + imageName: ${{ parameters.imageName }} + clones: ${{ parameters.clones }} diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml new file mode 100644 index 0000000000..77f26a7c41 --- /dev/null +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -0,0 +1,165 @@ +# AZL4 variant of build-image-template.yml. +# +# Forked from build-image-template.yml on 2026-05-13. The AZL3 path pulls the +# base VHDX from the AzureLinuxArtifacts ADO feed and the Trident RPM from the +# trident-binaries pipeline artifact, then runs `testimages.py build`. None of +# that works for AZL4 today because: +# +# 1. 
There is no AzureLinuxArtifacts feed entry for AZL4 base VHDX. We +# download from the AZL preview gallery's backing storage account +# (azlpubdev2mruiyvi/images-dev) instead. See the BlobImageManifest +# registration in tests/images/testimages.py. +# +# 2. There is no Trident RPM for AZL4. The binary is baked in via +# additionalFiles in tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml. +# +# TODO(azl4-merge-back): When AZL4 has feed-published base VHDXes and RPMs, +# fold this template back into build-image-template.yml by adding a +# `azureLinuxVersion: "4.0"` branch. + +parameters: + - name: tridentSourceDirectory + type: string + + - name: imageName + type: string + + - name: clones + type: number + default: 1 + displayName: Number of clones to create + + # The AZL4 base VHDX is sourced from the Azure Linux preview gallery's + # backing storage account. The pipeline service connection at + # $(BLOB_SERVICE_CONNECTION) must have `Storage Blob Data Reader` on + # this account. See tests/images/SERVICE-CONNECTION-RUNBOOK.md. + - name: blobStorageAccount + type: string + default: "azlpubdev2mruiyvi" + + - name: blobContainer + type: string + default: "images-dev" + + - name: blobSubscription + type: string + # Subscription where the storage account lives. The SC's default + # subscription may differ — we explicitly set context before download. + default: "e4ab81f8-030f-4593-a8f2-3ea2c7630a19" + + - name: blobServiceConnection + type: string + # NB: this must be a service connection that exists in the ADO project. + # Trident infra needs to create it manually (Karhu can't); see the PR-5 + # follow-up validation report for the runbook. 
+ default: "trident-azl4-blob-reader" + + - name: micContainerTag + type: string + default: "imagecustomizer:1.4.0-1" + +steps: + - template: ../common_tasks/avoid-pypi-usage.yml + + - template: common/sfi-enforce-isolation-with-etc-hosts.yaml@platform-pipelines + + # Stage the Trident binary that gets baked into the COSI via additionalFiles. + # The trident-binaries artifact comes from the same upstream Trident build + # stage the AZL3 path uses; we just copy the binary rather than installing + # an RPM. + # + # TODO(azl4-rpm): replace this binary copy with an RPM install once the + # trident-service RPM is packaged for AZL4 (same TODO as in + # tests/images/testimages.py registration). + - bash: | + set -euxo pipefail + TRIDENT_BIN_SRC="$(Build.ArtifactStagingDirectory)/trident-binaries" + TRIDENT_BIN_DEST="${{ parameters.tridentSourceDirectory }}/tests/images/trident-vm-testimage/base/trident-bin" + + if [ ! -f "$TRIDENT_BIN_SRC/trident" ]; then + echo "trident binary not found at $TRIDENT_BIN_SRC/trident" + echo "Available artifacts:" + find "$TRIDENT_BIN_SRC" -type f 2>/dev/null | head -20 || true + exit 1 + fi + + mkdir -p "$TRIDENT_BIN_DEST" + cp "$TRIDENT_BIN_SRC/trident" "$TRIDENT_BIN_DEST/trident" + chmod +x "$TRIDENT_BIN_DEST/trident" + file "$TRIDENT_BIN_DEST/trident" + displayName: "Stage Trident binary into testimage tree" + workingDirectory: ${{ parameters.tridentSourceDirectory }} + + # Pull the released MIC container from MCR. AZL4 support is included + # in imagecustomizer >= 1.4.0. + - bash: | + set -euxo pipefail + docker pull "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" + displayName: "Pull MIC container from MCR" + + # Stage the pipeline-wide SSH key into the testimage tree before + # MIC runs. testimages.py's generate_ssh_keys() generates a new + # keypair UNLESS files/id_rsa.pub already exists at the source path + # — in which case it reuses it. 
By dropping the shared key from the + # PrepareSSHKeys artifact here, both the qcow2 base build and the + # COSI build end up with the same key baked into testuser's + # authorized_keys, so storm-trident's A/B update test can SSH into + # both A-side and B-side after the update reboot. + # + # The matching private key lives at ssh-keys/id_rsa from the + # PrepareSSHKeys stage. storm-trident's rollback stage picks it up + # the same way for AZL3 builds. + - task: DownloadPipelineArtifact@2 + displayName: "Download shared SSH keys" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh-keys" + + - bash: | + set -euxo pipefail + SSH_PUB_SRC="$(Build.ArtifactStagingDirectory)/ssh-keys/id_rsa.pub" + SSH_PUB_DEST="${{ parameters.tridentSourceDirectory }}/tests/images/trident-vm-testimage/base/files/id_rsa.pub" + if [ ! -f "$SSH_PUB_SRC" ]; then + echo "shared SSH public key not found at $SSH_PUB_SRC" + find "$(Build.ArtifactStagingDirectory)/ssh-keys" -type f + exit 1 + fi + cp "$SSH_PUB_SRC" "$SSH_PUB_DEST" + echo "Staged shared SSH public key:" + cat "$SSH_PUB_DEST" + displayName: "Stage shared SSH key into testimage tree" + workingDirectory: ${{ parameters.tridentSourceDirectory }} + + # Download the AZL4 base VHDX from the preview gallery's backing storage. + # Authenticates via the federated identity attached to the service + # connection — no storage keys handled here. + # + # The SC's default subscription (Polar_ImageTools_Staging) differs from + # the storage account's subscription (ControlTower_Test). We must switch + # context so `az storage blob list` resolves the account correctly. 
+ - task: AzureCLI@2 + displayName: "Download AZL4 base VHDX from blob" + inputs: + azureSubscription: ${{ parameters.blobServiceConnection }} + scriptType: bash + scriptLocation: inlineScript + workingDirectory: ${{ parameters.tridentSourceDirectory }} + inlineScript: | + set -euxo pipefail + az account set --subscription "${{ parameters.blobSubscription }}" + python3 ./tests/images/testimages.py download-image azl4_qemu_guest \ + --blob-storage-account "${{ parameters.blobStorageAccount }}" \ + --blob-container "${{ parameters.blobContainer }}" + ls -la artifacts/azl4_qemu_guest.vhdx + + - bash: | + set -euxo pipefail + python3 ./tests/images/testimages.py build \ + "${{ parameters.imageName }}" \ + --container "${{ parameters.micContainerTag }}" \ + --output-dir "$(ob_outputDirectory)" \ + --no-download \ + --clones ${{ parameters.clones }} + displayName: "Build ${{ parameters.imageName }}" + workingDirectory: ${{ parameters.tridentSourceDirectory }} diff --git a/tests/images/SERVICE-CONNECTION-RUNBOOK.md b/tests/images/SERVICE-CONNECTION-RUNBOOK.md new file mode 100644 index 0000000000..2a17d49d2d --- /dev/null +++ b/tests/images/SERVICE-CONNECTION-RUNBOOK.md @@ -0,0 +1,225 @@ +# ADO Service Connection Runbook — UAMI + Workload Identity Federation + +Step-by-step recipe for creating an ADO Azure Resource Manager service +connection authenticated by a User-Assigned Managed Identity (UAMI) via +Workload Identity Federation (WIF). This is the SFI-compliant pattern; no +secrets are stored anywhere. + +Adapted from Brian's wiki [Creating an ADO Service Connection authenticated +with UMI](https://dev.azure.com/mariner-org/mariner/_wiki/wikis/mariner.wiki/5697/Creating-an-ADO-Service-Connection-authenticated-with-UMI), +with the concrete commands and gotchas from setting up the +`trident-azl4-blob-reader` connection on 2026-05-14. 
+ +## What you end up with + +``` +Azure UAMI ─(federated)→ ADO Service Connection ─(used by)→ Pipeline + │ + └─(role assignment)→ Target Azure resource +``` + +The pipeline uses `AzureCLI@2` referencing the SC. ADO mints an OIDC token, +exchanges it for an Azure access token via the UAMI's federated credential, +and the pipeline gets an `az login`'d session with the UAMI's RBAC. + +## Prerequisites + +- **Azure:** Contributor on the resource group where you'll create the UAMI +- **Azure:** User Access Administrator or Owner on the target resource you're + granting access to (for the role assignment) +- **ADO:** Project Administrator on the project where the service connection + will live + +## Step 1 — Create the UAMI (Azure CLI) + +```powershell +$sub = "<subscription-id>" +$rg = "<resource-group>" +$loc = "<region>" # match siblings if reusing an RG +$umi = "<uami-name>" # naming convention: see notes below + +az account set -s $sub + +# Pre-flight: confirm UAMI doesn't already exist +az identity show -g $rg -n $umi 2>$null +# (should return nothing) + +# Create +az identity create -g $rg -n $umi -l $loc ` + --tags purpose=<purpose> owner=<owner> project=<project> +``` + +The output contains `clientId` (use as ADO's Application ID later) and +`principalId` (use as the role-assignment assignee). + +### Naming convention notes + +Match what's already in the RG. Examples from +`maritimus-github-runner` (b3e01d89... sub): + +- `maritimus-github-runner-umi-*` for GitHub Actions identities +- `maritimus-github-storage-ado-*-umi` for ADO pipeline identities + +When in doubt, ask the RG owner before deviating. + +## Step 2 — Grant the UAMI access to the target resource + +For the trident-azl4-blob-reader UAMI, the target was the +`azlpubdev2mruiyvi` storage account (backing the AZL preview gallery), +with `Storage Blob Data Reader` (least privilege — we only need to read +base VHDXes). 
+ +```powershell +$objId = az identity show -g $rg -n $umi --query principalId -o tsv +$scope = "/subscriptions/$sub/resourceGroups/$rg/providers/<resource-provider>/<resource-type>/<resource-name>" + +az role assignment create ` + --assignee-object-id $objId ` + --assignee-principal-type ServicePrincipal ` + --role "<role-name>" ` + --scope $scope + +# Verify +az role assignment list --assignee $objId --all -o table +``` + +**Always use least privilege.** Don't pick `Owner` when `Reader` will do. + +## Step 3 — Start service connection in ADO (do NOT click Verify yet) + +In ADO project → Project Settings → Service Connections → New service +connection. + +| Field | Value | +|---|---| +| Connection type | **Azure Resource Manager** | +| Identity type | **App registration or managed identity (manual)** | +| Credential | **Workload Identity Federation** | +| Scope Level | **Subscription** | +| Subscription ID | `<subscription-id>` | +| Subscription Name | `<subscription-name>` | +| **Application (client) ID** | the UAMI's **clientId** from step 1 | +| Tenant ID | `72f988bf-86f1-41af-91ab-2d7cd011db47` (MSIT) | +| Service connection name | `<service-connection-name>` | +| Grant access permission to all pipelines | **uncheck** (see SFI note below) | + +After filling these in but **before saving**, ADO shows you: + +- **Issuer URL** +- **Subject identifier** + +Both are needed for step 4. Keep this ADO tab open. + +### Issuer/Subject gotcha — read them off the form + +⚠️ Do NOT guess these values. They are not the same as `vstoken.dev.azure.com/...` +that older service connections may show. ADO assigns a new pair when you +create the SC, and the issuer is the Entra tenant authority URL +(`https://login.microsoftonline.com/<tenant-id>/v2.0`), not the ADO token +issuer URL. The subject is opaque (looks like +`/eid1/c/pub/t/.../sc/.../`). + +Copy the exact strings from the ADO form into the FIC. Do not transcribe; +copy-paste. 
+ +## Step 4 — Add the federated credential to the UAMI + +```powershell +$issuer = "" +$subject = "" + +az identity federated-credential create ` + -g $rg ` + --identity-name $umi ` + --name "" ` + --issuer "$issuer" ` + --subject "$subject" ` + --audiences "api://AzureADTokenExchange" + +# Verify +az identity federated-credential list -g $rg --identity-name $umi -o table +``` + +FIC name should describe the consumer. For ADO connections we use +`ado--` (e.g. `ado-ecf-trident-azl4-blob-reader`). + +## Step 5 — Verify and save in ADO + +Wait ~30 seconds for Entra to propagate the FIC, then return to the ADO +form and click **Verify and save**. + +### Common errors + +**`AADSTS70025: client has no configured federated identity credentials`** +- The FIC hasn't been added yet. Run step 4. + +**`AADSTS700211: No matching federated identity record found for presented +assertion issuer 'https://login.microsoftonline.com//v2.0'`** +- The FIC exists but the issuer or subject doesn't match what ADO is + presenting. Re-read the ADO form carefully (do not transcribe — copy). +- A common mistake is reusing the issuer URL from an unrelated existing + service connection. Each new SC may get its own issuer string. + +**Verify succeeds but pipeline fails with `You do not have the required +permissions...`** +- The role assignment in step 2 either targeted the wrong scope, or + Azure RBAC hasn't propagated yet (wait up to 10 minutes). Re-check that + `az role assignment list --assignee --all` shows the role + on the correct scope. + +## Step 6 — SFI compliance — restrict pipeline permissions + +[SFI-ES2.4.11](https://eng.ms/docs/coreai/devdiv/one-engineering-system-1es/1es-docs/1es-security-configuration/azdo-config-remediation/all-pipeline-access-es-2-4-tsg) +prohibits leaving a service connection accessible to all pipelines. + +After saving: + +1. Open the new service connection in ADO +2. Click **More options (⋮) → Security** +3. 
Under **Pipeline permissions**, click **Restrict permission** +4. Click **+** and add each pipeline that needs the SC by ID/name. Do not + add "all pipelines." + +## When to use the manual cleanup path + +If something goes wrong mid-setup and you need to start over cleanly: + +```powershell +# Remove an FIC that pointed at the wrong issuer/subject +az identity federated-credential delete -g $rg --identity-name $umi --name "" --yes + +# Confirm no stray role assignments +az role assignment list --assignee --all -o table + +# In ADO: delete the SC via Project Settings → Service connections → ⋮ → Delete +# In Azure: only delete the UAMI itself if you're sure nothing else uses it +``` + +The UAMI does no harm by itself — it's a managed identity with role +assignments and FICs. Deleting it cascades to role assignments +automatically; FICs are removed with the parent UAMI. + +## Reference — the trident-azl4-blob-reader connection + +| Field | Value | +|---|---| +| Purpose | Read AZL4 base VHDX from the AZL preview gallery's backing storage for trident CI | +| Storage account | `azlpubdev2mruiyvi` (subscription `e4ab81f8-030f-4593-a8f2-3ea2c7630a19`, RG `azl-acg-preview-publishing`) | +| Gallery source | `azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64` (same subscription/RG) | +| UAMI name | `maritimus-github-storage-ado-trident-reader-umi` | +| UAMI subscription | `b3e01d89-bd55-414f-bbb4-cdfeb2628caa` (`AzureCNMP_CNP_AzureLinux_Polar_ImageTools_Staging`) | +| UAMI resource group | `maritimus-github-runner` | +| UAMI region | `westus2` | +| UAMI clientId | `5eaafbf5-279b-4f16-b797-50bd730dcdb8` | +| UAMI principalId | `97c7c5f1-db58-4e65-8c4a-b6d614a72657` | +| Role granted | `Storage Blob Data Reader` on `azlpubdev2mruiyvi` | +| FIC name | `ado-ecf-trident-azl4-blob-reader` | +| ADO project | `mariner-org/ECF` | +| ADO SC name | `trident-azl4-blob-reader` | +| Pipelines allowed | `[GITHUB]-trident-pr-e2e`, `[GITHUB]-trident-ci`, `[GITHUB]-trident-pr-e2e-azure` | +| 
Created | 2026-05-14 | +| Updated | 2026-06-01 (re-scoped from `maritimusgithubstorage` to `azlpubdev2mruiyvi`) | + +When the `AzureLinuxArtifacts` ADO feed publishes AZL4 base VHDXes, +this connection can be deleted — the standard `BaseImageManifest` +download path will handle it. diff --git a/tests/images/builder/__init__.py b/tests/images/builder/__init__.py index ca82f58db9..2881fe8518 100644 --- a/tests/images/builder/__init__.py +++ b/tests/images/builder/__init__.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field, fields from enum import Enum from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union @dataclass @@ -16,6 +16,9 @@ class BaseImage(Enum): BAREMETAL = BaseImageData("baremetal", Path("artifacts/baremetal.vhdx")) CORE_SELINUX = BaseImageData("core_selinux", Path("artifacts/core_selinux.vhdx")) QEMU_GUEST = BaseImageData("qemu_guest", Path("artifacts/qemu_guest.vhdx")) + AZL4_QEMU_GUEST = BaseImageData( + "azl4_qemu_guest", Path("artifacts/azl4_qemu_guest.vhdx") + ) CORE_ARM64 = BaseImageData("core_arm64", Path("artifacts/core_arm64.vhdx")) MINIMAL = BaseImageData("minimal", Path("artifacts/minimal.vhdx")) MINIMAL_AARCH64 = BaseImageData( @@ -60,6 +63,34 @@ class BaseImageManifest: glob: str = "*.vhdx" +@dataclass +class BlobImageManifest: + """Manifest for a base image fetched from Azure Storage Blob. + + Used for distros that don't yet publish to an ADO universal artifact + feed (e.g., Azure Linux 4.0 alpha builds). The storage account name + and container are NOT baked in here -- they are supplied at + invocation time via the --blob-storage-account / --blob-container + flags (or the BLOB_STORAGE_ACCOUNT / BLOB_CONTAINER env vars) so the + pipeline can parameterize them and rotate the location without a + code change. + + Authentication is via `az` CLI logged-in identity (`--auth-mode + login`). The pipeline running this must have a federated identity + with read access to the storage account. 
+ """ + + image: BaseImage + # Blob name prefix to search under + # (e.g. "azure-linux/core-efi-vhdx-4.0-amd64") + path_prefix: str + # Suffix the final blob name must end with. + # The downloader lists all blobs under path_prefix, filters to ones + # ending with this suffix, and picks the lexically largest (= most + # recent version) to download. + file_suffix: str = "/image.vhdx" + + class OutputFormat(Enum): BAREMETAL_IMAGE = "baremetal-image" COSI = "cosi" @@ -249,7 +280,9 @@ class ArtifactManifest: customizer_version: str customizer_container: str customizer_container_full: str = None - base_images: List[BaseImageManifest] = field(default_factory=list) + base_images: List[Union["BaseImageManifest", "BlobImageManifest"]] = field( + default_factory=list + ) def __post_init__(self): if self.customizer_container_full is None: @@ -264,7 +297,9 @@ def kebab_fields(cls) -> List[str]: """Return a list of fields in kebab-case.""" return [f.name.replace("_", "-") for f in fields(cls)] - def find_base_image(self, img: BaseImage) -> Optional[BaseImageManifest]: + def find_base_image( + self, img: BaseImage + ) -> Optional[Union["BaseImageManifest", "BlobImageManifest"]]: """Find a base image by its name.""" for base_image in self.base_images: if base_image.image == img: diff --git a/tests/images/builder/cli.py b/tests/images/builder/cli.py index 741f0c2396..39e8e9aadd 100644 --- a/tests/images/builder/cli.py +++ b/tests/images/builder/cli.py @@ -1,6 +1,7 @@ import argparse from enum import Enum import logging +import os from pathlib import Path from typing import List @@ -183,7 +184,8 @@ def setup_parser_download_image( ) -> None: parser_download_img = subparsers.add_parser( SubCommand.DOWNLOAD_IMAGE.value, - help="Download a base image from the Azure DevOps feed", + help="Download a base image (from the Azure DevOps feed, or from " + "Azure Storage Blob for distros without a published feed).", ) parser_download_img.set_defaults(artifacts=artifacts) 
parser_download_img.add_argument( @@ -191,6 +193,22 @@ def setup_parser_download_image( help="The image to download", choices=[c.image.name for c in artifacts.base_images], ) + parser_download_img.add_argument( + "--blob-storage-account", + default=os.environ.get("BLOB_STORAGE_ACCOUNT"), + help="Azure Storage account name to pull blob-sourced base images " + "from. Required when downloading an image whose manifest is a " + "BlobImageManifest. Falls back to the BLOB_STORAGE_ACCOUNT env " + "var. Not used for ADO-feed base images.", + ) + parser_download_img.add_argument( + "--blob-container", + default=os.environ.get("BLOB_CONTAINER"), + help="Azure Storage container name to pull blob-sourced base " + "images from. Required when downloading an image whose manifest " + "is a BlobImageManifest. Falls back to the BLOB_CONTAINER env " + "var. Not used for ADO-feed base images.", + ) def setup_parser_matrix( @@ -285,6 +303,8 @@ def run_cmd( run.download_base_image( artifacts=args.artifacts, name=args.image, + blob_storage_account=args.blob_storage_account, + blob_container=args.blob_container, ) elif subcommand == SubCommand.MATRIX: run.generate_matrix( diff --git a/tests/images/builder/download.py b/tests/images/builder/download.py index 6f9db4c9f0..56a1313af0 100644 --- a/tests/images/builder/download.py +++ b/tests/images/builder/download.py @@ -1,9 +1,15 @@ +import json +import logging +import os +import re from pathlib import Path import shutil import subprocess import tempfile -from builder import BaseImageManifest +from builder import BaseImageManifest, BlobImageManifest + +log = logging.getLogger(__name__) def download_base_image(image: BaseImageManifest) -> None: @@ -39,3 +45,140 @@ def download_base_image(image: BaseImageManifest) -> None: # Copy the .vhdx file to the target location shutil.copy2(vhdx_files[0], image.image.path) + + +# Constrain blob filename selection to a date-prefixed shape so a stray +# blob with a name that lexically sorts last 
(`zzz-evil/image.vhdx`) +# cannot win selection. Matches `YYYYMMDD/` or `YYYY-MM-DD/`-style +# version prefixes, which is the upstream publisher's convention. +# +# This is defense against a broader governance issue: the storage account +# is owned by another team, so write access is out of Trident's control. +# The regex narrows the attack surface to "names matching this shape" +# while still letting us track the latest published version. Tracked +# longer-term in the AZL4 supply-chain governance discussion. +_BLOB_NAME_VERSION_RE = re.compile(r"/([^/]*\d{4}-?\d{2}-?\d{2}[^/]*)/") + + +def download_blob_image( + image: BlobImageManifest, + storage_account: str, + container: str, +) -> None: + """Download a base image from Azure Storage Blob. + + Lists blobs under `image.path_prefix`, filters to ones whose name + matches a date-prefixed version pattern AND ends with + `image.file_suffix`, picks the lexically largest (= most recent + date), and downloads it atomically to `image.image.path`. + + Requires `az` CLI with a logged-in identity that has read access + to the storage account. Uses `--auth-mode login` so no storage + keys are needed. + """ + if not storage_account or not container: + raise RuntimeError( + f"Blob storage account/container required to download " + f"'{image.image.name}'. Pass --blob-storage-account and " + f"--blob-container, or set BLOB_STORAGE_ACCOUNT and " + f"BLOB_CONTAINER env vars." + ) + + az = shutil.which("az") + if az is None: + raise RuntimeError( + "az CLI not found on PATH; required to fetch blob-sourced " + "base images. Install azure-cli." + ) + + log.info( + f"Listing blobs in '{storage_account}/{container}' under " + f"prefix '{image.path_prefix}/'" + ) + # No `--query` interpolation: do the filtering in Python so caller + # control of `image.file_suffix` (or any other field that might + # become externally settable later) cannot inject JMESPath. 
+ list_proc = subprocess.run( + [ + az, + "storage", + "blob", + "list", + "--auth-mode", + "login", + "--account-name", + storage_account, + "--container-name", + container, + "--prefix", + f"{image.path_prefix}/", + "--query", + "[].name", + "-o", + "json", + ], + check=True, + capture_output=True, + text=True, + ) + all_names = json.loads(list_proc.stdout) + suffix = image.file_suffix + eligible = [ + n for n in all_names if n.endswith(suffix) and _BLOB_NAME_VERSION_RE.search(n) + ] + if not eligible: + raise RuntimeError( + f"No date-versioned blobs ending with '{suffix}' found under " + f"'{image.path_prefix}/' in '{storage_account}/{container}' " + f"(saw {len(all_names)} total blobs under the prefix)" + ) + + latest = sorted(eligible)[-1] + log.info(f"Latest: {latest}") + + image.image.path.parent.mkdir(parents=True, exist_ok=True) + + # Download to a sibling temp file then atomically rename. `az + # storage blob download` writes in place — if the step is killed + # (timeout / OOM / agent reboot) between create and complete, the + # next run sees a truncated VHDX and MIC fails with an opaque + # error. The temp-then-rename pattern guarantees the target either + # has the full bytes or doesn't exist. + target = image.image.path + fd, tmp_path = tempfile.mkstemp( + prefix=target.name + ".", + suffix=".part", + dir=str(target.parent), + ) + os.close(fd) + try: + subprocess.run( + [ + az, + "storage", + "blob", + "download", + "--auth-mode", + "login", + "--account-name", + storage_account, + "--container-name", + container, + "--name", + latest, + "--file", + tmp_path, + "--output", + "none", + ], + check=True, + ) + os.replace(tmp_path, target) + except BaseException: + # On any failure, remove the temp file so we don't leave + # partial-state debris next to the final path. 
+ try: + os.unlink(tmp_path) + except FileNotFoundError: + pass + raise diff --git a/tests/images/builder/run.py b/tests/images/builder/run.py index d465beb2f2..8c93bdcb1e 100644 --- a/tests/images/builder/run.py +++ b/tests/images/builder/run.py @@ -3,7 +3,7 @@ import json from typing import List, Optional -from builder import ImageConfig, RpmSources, ArtifactManifest +from builder import ArtifactManifest, BlobImageManifest, ImageConfig, RpmSources from .builder import build_image from .convert import convert_image from . import download @@ -148,6 +148,8 @@ def download_base_image( *, artifacts: ArtifactManifest, name: str, + blob_storage_account: Optional[str] = None, + blob_container: Optional[str] = None, ) -> None: image_manifest = next( (img for img in artifacts.base_images if img.image.name == name), None @@ -155,7 +157,15 @@ def download_base_image( if image_manifest is None: raise ValueError(f"Image '{name}' not found in artifacts") log.info(f"Downloading base image '{name}' to '{image_manifest.image.path}'") - download.download_base_image(image_manifest) + + if isinstance(image_manifest, BlobImageManifest): + download.download_blob_image( + image_manifest, + storage_account=blob_storage_account, + container=blob_container, + ) + else: + download.download_base_image(image_manifest) def generate_matrix( diff --git a/tests/images/testimages.py b/tests/images/testimages.py index 9ab341cba9..b4d8b5416d 100755 --- a/tests/images/testimages.py +++ b/tests/images/testimages.py @@ -7,6 +7,7 @@ ArtifactManifest, BaseImage, BaseImageManifest, + BlobImageManifest, ImageConfig, OutputFormat, SystemArchitecture, @@ -132,6 +133,47 @@ config_file="base/updateimg-grub.yaml", ssh_key="files/id_rsa.pub", ), + ImageConfig( + # AZL4 (Fedora-derived) variant of trident-vm-grub-testimage. + # The base VHDX is pulled from Azure Storage (see + # BlobImageManifest below) since there is no AzureLinuxArtifacts + # ADO feed entry for AZL4 yet. 
The Trident binary is baked in + # via additionalFiles because the trident-service RPM is not + # yet packaged for AZL4. + "trident-vm-grub-testimage-azl4", + base_image=BaseImage.AZL4_QEMU_GUEST, + config="trident-vm-testimage", + config_file="base/updateimg-grub-azl4.yaml", + ssh_key="files/id_rsa.pub", + # No trident-service RPM for AZL4 yet — the binary is delivered + # via additionalFiles. extra_dependencies enforces both binaries + # are in place before the image is built (osmodifier is delivered + # the same way until an AZL4 RPM exists; see + # tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml + # for the additionalFiles entries that consume both paths). + requires_trident=False, + extra_dependencies=[ + Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), + Path("tests/images/trident-vm-testimage/base/osmodifier-bin/osmodifier"), + ], + ), + ImageConfig( + # AZL4 BASE qcow2: a bootable disk with the AZL4 OS plus trident + # installed, so storm-trident rollback testing can boot a VM and + # immediately drive A/B updates targeting the .cosi above. + # Mirrors AZL3's `make artifacts/trident-vm-grub-testimage.qcow2` + # path. See baseimg-grub-azl4.yaml for the layout / package set. + "trident-vm-grub-testimage-azl4-base", + base_image=BaseImage.AZL4_QEMU_GUEST, + config="trident-vm-testimage", + config_file="base/baseimg-grub-azl4.yaml", + output_format=OutputFormat.QCOW2, + ssh_key="files/id_rsa.pub", + requires_trident=False, + extra_dependencies=[ + Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), + ], + ), ImageConfig( "trident-vm-grub-verity-testimage", base_image=BaseImage.QEMU_GUEST, @@ -246,6 +288,23 @@ package_name="minimal_vhdx-3.0-stable", version="*", ), + BlobImageManifest( + # Azure Linux 4.0 base VHDX from the AZL preview gallery's + # backing storage. Pinned to a specific daily build — bump + # the version segment in path_prefix to pick up a newer one. 
+ # + # Source gallery: + # azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64 + # subscription e4ab81f8-030f-4593-a8f2-3ea2c7630a19 + # RG azl-acg-preview-publishing + # + # Storage account + container are supplied at runtime via + # --blob-storage-account / --blob-container CLI flags or + # the BLOB_STORAGE_ACCOUNT / BLOB_CONTAINER env vars. + image=BaseImage.AZL4_QEMU_GUEST, + path_prefix="staging/azure-linux-4-daily-x64/4.0.2026051502", + file_suffix=".vhdfixed", + ), ], ) From eae6848b8d41ab8aa29951a77393b863887d46c2 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 18:26:53 -0700 Subject: [PATCH 15/42] fix: Tag MCR MIC container with local short name after pull testimages.py runs docker with the short tag (imagecustomizer:1.4.0-1) but docker pull uses the full MCR path. Without a local tag, docker run fails with 'pull access denied'. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../stages/build_image/build-image-template-azl4.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml index 77f26a7c41..31e163b596 100644 --- a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -91,10 +91,12 @@ steps: workingDirectory: ${{ parameters.tridentSourceDirectory }} # Pull the released MIC container from MCR. AZL4 support is included - # in imagecustomizer >= 1.4.0. + # in imagecustomizer >= 1.4.0. Tag it locally so testimages.py can + # reference it by short name. 
- bash: | set -euxo pipefail docker pull "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" + docker tag "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" "${{ parameters.micContainerTag }}" displayName: "Pull MIC container from MCR" # Stage the pipeline-wide SSH key into the testimage tree before From 73835d5fcbf42a823a52b4655103d6875dadb99a Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 12:47:55 -0700 Subject: [PATCH 16/42] docs: Update TODOs to not assume AzureLinuxArtifacts feed for AZL4 AZL4 base VHDXes may continue to come from blob storage rather than the ADO feed. The trident-service RPM will come from an AZL4 package repo, not ADO. Update comments to reflect this. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../stages/build_image/build-image-azl4.yml | 6 ++++-- .../build_image/build-image-template-azl4.yml | 18 +++++++++--------- tests/images/SERVICE-CONNECTION-RUNBOOK.md | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.pipelines/templates/stages/build_image/build-image-azl4.yml b/.pipelines/templates/stages/build_image/build-image-azl4.yml index 0fae10eb29..a2901cd842 100644 --- a/.pipelines/templates/stages/build_image/build-image-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-azl4.yml @@ -5,8 +5,10 @@ # external test-images repo template. # # TODO(azl4-merge-back): Merge this back into build-image.yml with an -# `azureLinuxVersion` parameter switch once AZL4 has feed-published base VHDXes -# and RPMs. +# `azureLinuxVersion` parameter switch once AZL4 base VHDX acquisition +# and trident-service RPM packaging are resolved. The base VHDX may +# continue to come from blob storage (not the AzureLinuxArtifacts ADO +# feed); the RPM will come from an AZL4 package repo, not ADO. 
parameters: - name: imageName diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml index 31e163b596..7b679b0848 100644 --- a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -2,20 +2,20 @@ # # Forked from build-image-template.yml on 2026-05-13. The AZL3 path pulls the # base VHDX from the AzureLinuxArtifacts ADO feed and the Trident RPM from the -# trident-binaries pipeline artifact, then runs `testimages.py build`. None of -# that works for AZL4 today because: +# trident-binaries pipeline artifact, then runs `testimages.py build`. AZL4 +# uses different acquisition paths: # -# 1. There is no AzureLinuxArtifacts feed entry for AZL4 base VHDX. We -# download from the AZL preview gallery's backing storage account -# (azlpubdev2mruiyvi/images-dev) instead. See the BlobImageManifest +# 1. Base VHDX comes from the AZL preview gallery's backing storage +# (azlpubdev2mruiyvi/images-dev). See the BlobImageManifest # registration in tests/images/testimages.py. # -# 2. There is no Trident RPM for AZL4. The binary is baked in via +# 2. There is no Trident RPM for AZL4 yet. The binary is baked in via # additionalFiles in tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml. # -# TODO(azl4-merge-back): When AZL4 has feed-published base VHDXes and RPMs, -# fold this template back into build-image-template.yml by adding a -# `azureLinuxVersion: "4.0"` branch. +# TODO(azl4-merge-back): Fold this template back into build-image-template.yml +# once the AZL4 base VHDX and trident-service RPM acquisition paths are +# standardized. The base VHDX may stay as a blob download; the RPM will +# come from an AZL4 package repo. 
parameters: - name: tridentSourceDirectory diff --git a/tests/images/SERVICE-CONNECTION-RUNBOOK.md b/tests/images/SERVICE-CONNECTION-RUNBOOK.md index 2a17d49d2d..fe448ae4b2 100644 --- a/tests/images/SERVICE-CONNECTION-RUNBOOK.md +++ b/tests/images/SERVICE-CONNECTION-RUNBOOK.md @@ -220,6 +220,6 @@ automatically; FICs are removed with the parent UAMI. | Created | 2026-05-14 | | Updated | 2026-06-01 (re-scoped from `maritimusgithubstorage` to `azlpubdev2mruiyvi`) | -When the `AzureLinuxArtifacts` ADO feed publishes AZL4 base VHDXes, -this connection can be deleted — the standard `BaseImageManifest` -download path will handle it. +When AZL4 base VHDX acquisition is standardized (either via the +`AzureLinuxArtifacts` ADO feed or a permanent blob location), this +connection can be re-evaluated. From 9dabb187ef8042494e7d3b0137dd1907ae6412cb Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 12:52:41 -0700 Subject: [PATCH 17/42] fix: Remove SERVICE-CONNECTION-RUNBOOK from public repo Contains internal infrastructure details (UAMI names, principal IDs, subscription IDs, FIC configuration) that should not be published to a public GitHub repository. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/SERVICE-CONNECTION-RUNBOOK.md | 225 --------------------- 1 file changed, 225 deletions(-) delete mode 100644 tests/images/SERVICE-CONNECTION-RUNBOOK.md diff --git a/tests/images/SERVICE-CONNECTION-RUNBOOK.md b/tests/images/SERVICE-CONNECTION-RUNBOOK.md deleted file mode 100644 index fe448ae4b2..0000000000 --- a/tests/images/SERVICE-CONNECTION-RUNBOOK.md +++ /dev/null @@ -1,225 +0,0 @@ -# ADO Service Connection Runbook — UAMI + Workload Identity Federation - -Step-by-step recipe for creating an ADO Azure Resource Manager service -connection authenticated by a User-Assigned Managed Identity (UAMI) via -Workload Identity Federation (WIF). This is the SFI-compliant pattern; no -secrets are stored anywhere. 
- -Adapted from Brian's wiki [Creating an ADO Service Connection authenticated -with UMI](https://dev.azure.com/mariner-org/mariner/_wiki/wikis/mariner.wiki/5697/Creating-an-ADO-Service-Connection-authenticated-with-UMI), -with the concrete commands and gotchas from setting up the -`trident-azl4-blob-reader` connection on 2026-05-14. - -## What you end up with - -``` -Azure UAMI ─(federated)→ ADO Service Connection ─(used by)→ Pipeline - │ - └─(role assignment)→ Target Azure resource -``` - -The pipeline uses `AzureCLI@2` referencing the SC. ADO mints an OIDC token, -exchanges it for an Azure access token via the UAMI's federated credential, -and the pipeline gets an `az login`'d session with the UAMI's RBAC. - -## Prerequisites - -- **Azure:** Contributor on the resource group where you'll create the UAMI -- **Azure:** User Access Administrator or Owner on the target resource you're - granting access to (for the role assignment) -- **ADO:** Project Administrator on the project where the service connection - will live - -## Step 1 — Create the UAMI (Azure CLI) - -```powershell -$sub = "" -$rg = "" -$loc = "" # match siblings if reusing an RG -$umi = "" # naming convention: see notes below - -az account set -s $sub - -# Pre-flight: confirm UAMI doesn't already exist -az identity show -g $rg -n $umi 2>$null -# (should return nothing) - -# Create -az identity create -g $rg -n $umi -l $loc ` - --tags purpose= owner= project= -``` - -The output contains `clientId` (use as ADO's Application ID later) and -`principalId` (use as the role-assignment assignee). - -### Naming convention notes - -Match what's already in the RG. Examples from -`maritimus-github-runner` (b3e01d89... sub): - -- `maritimus-github-runner-umi-*` for GitHub Actions identities -- `maritimus-github-storage-ado-*-umi` for ADO pipeline identities - -When in doubt, ask the RG owner before deviating. 
- -## Step 2 — Grant the UAMI access to the target resource - -For the trident-azl4-blob-reader UAMI, the target was the -`azlpubdev2mruiyvi` storage account (backing the AZL preview gallery), -with `Storage Blob Data Reader` (least privilege — we only need to read -base VHDXes). - -```powershell -$objId = az identity show -g $rg -n $umi --query principalId -o tsv -$scope = "/subscriptions/$sub/resourceGroups/$rg/providers///" - -az role assignment create ` - --assignee-object-id $objId ` - --assignee-principal-type ServicePrincipal ` - --role "" ` - --scope $scope - -# Verify -az role assignment list --assignee $objId --all -o table -``` - -**Always use least privilege.** Don't pick `Owner` when `Reader` will do. - -## Step 3 — Start service connection in ADO (do NOT click Verify yet) - -In ADO project → Project Settings → Service Connections → New service -connection. - -| Field | Value | -|---|---| -| Connection type | **Azure Resource Manager** | -| Identity type | **App registration or managed identity (manual)** | -| Credential | **Workload Identity Federation** | -| Scope Level | **Subscription** | -| Subscription ID | `` | -| Subscription Name | `` | -| **Application (client) ID** | the UAMI's **clientId** from step 1 | -| Tenant ID | `72f988bf-86f1-41af-91ab-2d7cd011db47` (MSIT) | -| Service connection name | `` | -| Grant access permission to all pipelines | **uncheck** (see SFI note below) | - -After filling these in but **before saving**, ADO shows you: - -- **Issuer URL** -- **Subject identifier** - -Both are needed for step 4. Keep this ADO tab open. - -### Issuer/Subject gotcha — read them off the form - -⚠️ Do NOT guess these values. They are not the same as `vstoken.dev.azure.com/...` -that older service connections may show. ADO assigns a new pair when you -create the SC, and the issuer is the Entra tenant authority URL -(`https://login.microsoftonline.com//v2.0`), not the ADO token -issuer URL. 
The subject is opaque (looks like -`/eid1/c/pub/t/.../sc/.../`). - -Copy the exact strings from the ADO form into the FIC. Do not transcribe; -copy-paste. - -## Step 4 — Add the federated credential to the UAMI - -```powershell -$issuer = "" -$subject = "" - -az identity federated-credential create ` - -g $rg ` - --identity-name $umi ` - --name "" ` - --issuer "$issuer" ` - --subject "$subject" ` - --audiences "api://AzureADTokenExchange" - -# Verify -az identity federated-credential list -g $rg --identity-name $umi -o table -``` - -FIC name should describe the consumer. For ADO connections we use -`ado--` (e.g. `ado-ecf-trident-azl4-blob-reader`). - -## Step 5 — Verify and save in ADO - -Wait ~30 seconds for Entra to propagate the FIC, then return to the ADO -form and click **Verify and save**. - -### Common errors - -**`AADSTS70025: client has no configured federated identity credentials`** -- The FIC hasn't been added yet. Run step 4. - -**`AADSTS700211: No matching federated identity record found for presented -assertion issuer 'https://login.microsoftonline.com//v2.0'`** -- The FIC exists but the issuer or subject doesn't match what ADO is - presenting. Re-read the ADO form carefully (do not transcribe — copy). -- A common mistake is reusing the issuer URL from an unrelated existing - service connection. Each new SC may get its own issuer string. - -**Verify succeeds but pipeline fails with `You do not have the required -permissions...`** -- The role assignment in step 2 either targeted the wrong scope, or - Azure RBAC hasn't propagated yet (wait up to 10 minutes). Re-check that - `az role assignment list --assignee --all` shows the role - on the correct scope. - -## Step 6 — SFI compliance — restrict pipeline permissions - -[SFI-ES2.4.11](https://eng.ms/docs/coreai/devdiv/one-engineering-system-1es/1es-docs/1es-security-configuration/azdo-config-remediation/all-pipeline-access-es-2-4-tsg) -prohibits leaving a service connection accessible to all pipelines. 
- -After saving: - -1. Open the new service connection in ADO -2. Click **More options (⋮) → Security** -3. Under **Pipeline permissions**, click **Restrict permission** -4. Click **+** and add each pipeline that needs the SC by ID/name. Do not - add "all pipelines." - -## When to use the manual cleanup path - -If something goes wrong mid-setup and you need to start over cleanly: - -```powershell -# Remove an FIC that pointed at the wrong issuer/subject -az identity federated-credential delete -g $rg --identity-name $umi --name "" --yes - -# Confirm no stray role assignments -az role assignment list --assignee --all -o table - -# In ADO: delete the SC via Project Settings → Service connections → ⋮ → Delete -# In Azure: only delete the UAMI itself if you're sure nothing else uses it -``` - -The UAMI does no harm by itself — it's a managed identity with role -assignments and FICs. Deleting it cascades to role assignments -automatically; FICs are removed with the parent UAMI. - -## Reference — the trident-azl4-blob-reader connection - -| Field | Value | -|---|---| -| Purpose | Read AZL4 base VHDX from the AZL preview gallery's backing storage for trident CI | -| Storage account | `azlpubdev2mruiyvi` (subscription `e4ab81f8-030f-4593-a8f2-3ea2c7630a19`, RG `azl-acg-preview-publishing`) | -| Gallery source | `azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64` (same subscription/RG) | -| UAMI name | `maritimus-github-storage-ado-trident-reader-umi` | -| UAMI subscription | `b3e01d89-bd55-414f-bbb4-cdfeb2628caa` (`AzureCNMP_CNP_AzureLinux_Polar_ImageTools_Staging`) | -| UAMI resource group | `maritimus-github-runner` | -| UAMI region | `westus2` | -| UAMI clientId | `5eaafbf5-279b-4f16-b797-50bd730dcdb8` | -| UAMI principalId | `97c7c5f1-db58-4e65-8c4a-b6d614a72657` | -| Role granted | `Storage Blob Data Reader` on `azlpubdev2mruiyvi` | -| FIC name | `ado-ecf-trident-azl4-blob-reader` | -| ADO project | `mariner-org/ECF` | -| ADO SC name | `trident-azl4-blob-reader` | -| 
Pipelines allowed | `[GITHUB]-trident-pr-e2e`, `[GITHUB]-trident-ci`, `[GITHUB]-trident-pr-e2e-azure` | -| Created | 2026-05-14 | -| Updated | 2026-06-01 (re-scoped from `maritimusgithubstorage` to `azlpubdev2mruiyvi`) | - -When AZL4 base VHDX acquisition is standardized (either via the -`AzureLinuxArtifacts` ADO feed or a permanent blob location), this -connection can be re-evaluated. From f81d73e3faa1dfd996967fce67a0085a8605cfe3 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 12:58:40 -0700 Subject: [PATCH 18/42] docs: Trim verbose CLI help strings in testimages.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/builder/cli.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/images/builder/cli.py b/tests/images/builder/cli.py index 39e8e9aadd..784c4c534d 100644 --- a/tests/images/builder/cli.py +++ b/tests/images/builder/cli.py @@ -184,30 +184,25 @@ def setup_parser_download_image( ) -> None: parser_download_img = subparsers.add_parser( SubCommand.DOWNLOAD_IMAGE.value, - help="Download a base image (from the Azure DevOps feed, or from " - "Azure Storage Blob for distros without a published feed).", + help="Download a base image.", ) parser_download_img.set_defaults(artifacts=artifacts) parser_download_img.add_argument( "image", - help="The image to download", + help="The image to download.", choices=[c.image.name for c in artifacts.base_images], ) parser_download_img.add_argument( "--blob-storage-account", default=os.environ.get("BLOB_STORAGE_ACCOUNT"), - help="Azure Storage account name to pull blob-sourced base images " - "from. Required when downloading an image whose manifest is a " - "BlobImageManifest. Falls back to the BLOB_STORAGE_ACCOUNT env " - "var. Not used for ADO-feed base images.", + help="Azure Storage account name for blob-sourced images. 
" + "Env: BLOB_STORAGE_ACCOUNT.", ) parser_download_img.add_argument( "--blob-container", default=os.environ.get("BLOB_CONTAINER"), - help="Azure Storage container name to pull blob-sourced base " - "images from. Required when downloading an image whose manifest " - "is a BlobImageManifest. Falls back to the BLOB_CONTAINER env " - "var. Not used for ADO-feed base images.", + help="Azure Storage container name for blob-sourced images. " + "Env: BLOB_CONTAINER.", ) From 28b09d1cd8e4cf5498f191d04a75ed2185a0ea45 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:25 -0700 Subject: [PATCH 19/42] infra: Add AZL4 COSI image config, pipeline stages, and E2E configs Adds AZL4 E2E pipeline parameters, COSI update-image YAML config, test-image helper scripts, and base/rollback trident configurations. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/templates/e2e-template.yml | 38 +++++ .../base-azl4/test-selection.yaml | 5 + .../base-azl4/trident-config.yaml | 68 ++++++++ .../rollback-azl4/test-selection.yaml | 3 + .../rollback-azl4/trident-config.yaml | 84 +++++++++ tests/images/trident-vm-testimage/README.md | 46 +++++ .../base/files/hostname-shim.sh | 20 +++ .../base/files/regen-sshd-keys.service | 14 ++ .../base/scripts/enable-regen-sshd-keys.sh | 7 + .../base/scripts/ssh-move-host-keys-azl4.sh | 13 ++ .../base/updateimg-grub-azl4.yaml | 161 ++++++++++++++++++ 11 files changed, 459 insertions(+) create mode 100644 tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml create mode 100644 tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml create mode 100644 tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml create mode 100644 tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml create mode 100644 tests/images/trident-vm-testimage/base/files/hostname-shim.sh create mode 100644 tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service 
create mode 100755 tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh create mode 100755 tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh create mode 100644 tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml diff --git a/.pipelines/templates/e2e-template.yml b/.pipelines/templates/e2e-template.yml index a0654303a4..1dfd69fb7a 100644 --- a/.pipelines/templates/e2e-template.yml +++ b/.pipelines/templates/e2e-template.yml @@ -224,6 +224,44 @@ stages: micVersion: ${{ parameters.micVersion }} dependsOnStage: ${{ parameters.baseImageArtifactStage }} + # Build the AZL4 test image (pinned-MIC path). + # + # TODO(azl4-release): Drop the bespoke build-image-azl4.yml call once AZL4 + # has feed-published base VHDXes, RPMs, and a released MIC container. + # Then this can be a plain build-image.yml call with an azureLinuxVersion + # parameter, matching the other testimage stages. + # + # Gating mirrors the AzL installer ISO below so AZL4 build runs in every + # stage type that gates a trunk merge. Previously this only ran on + # pr-e2e / ci / pr-e2e-azure, which silently skipped AZL4 in + # azl-validation / full-validation — exactly the stage you'd want it. + - ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation'), eq(parameters.stageType, 'full-validation')) }}: + - template: stages/build_image/build-image-azl4.yml + parameters: + imageName: trident-vm-grub-testimage-azl4 + dependsOnStage: ${{ parameters.baseImageArtifactStage }} + + # AZL4 base qcow2 — boot point for the VM offline-init / rollback + # path. Same build template as the COSI above; output_format + # differs (QCOW2 vs COSI) per the testimages.py registration. 
+ - template: stages/build_image/build-image-azl4.yml + parameters: + imageName: trident-vm-grub-testimage-azl4-base + dependsOnStage: ${{ parameters.baseImageArtifactStage }} + + # AZL4 BM-simulated netlaunch test. Uses the AZL3 MOS installer ISO + # (built by TridentTestImg_trident_installer below) plus the AZL4 + # COSI built above. Trident runs from the live MOS environment and + # installs the AZL4 COSI onto a fresh virtdeploy VM disk. This is + # the same flow we proved out manually on karhu-ubuntu. + - template: stages/testing_vm/netlaunch-testing-azl4.yml + + # AZL4 VM offline-init rollback test. The base qcow2 already has + # trident's datastore populated by its first-boot offline-init + # oneshot, so storm-trident can drive A/B update + rollback against + # the AZL4 COSI without the MOS bridge. + - template: stages/testing_rollback/vm-testing-azl4.yml + # Build AzL installer ISO (attended and unattended) - ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation')) }}: - template: stages/azl_installer/azl-installer.yml diff --git a/tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml b/tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml new file mode 100644 index 0000000000..1789997bd2 --- /dev/null +++ b/tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml @@ -0,0 +1,5 @@ +compatible: + - base-azl4 + # Reuse the same pytest assertions as the AZL3 `base` scenario where + # appropriate. Add this scenario explicitly to test markers as we wire + # up pytest coverage. 
diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml new file mode 100644 index 0000000000..a3aac68b95 --- /dev/null +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -0,0 +1,68 @@ +image: + url: http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi + sha384: ignored +# Note: AZL4 doesn't ship grub2-efi-binary-noprefix. We deliberately do +# not set `internalParams.disableGrubNoprefixCheck` here — trident +# auto-detects AZL4 (via `is_azl4_or_later` in +# crates/trident/src/subsystems/esp.rs) and skips the check itself, so +# this scenario exercises the auto-detection path that real customers +# will hit. +storage: + disks: + - id: os + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-2 + partitionTableType: gpt + partitions: + - id: root-a + type: root + size: 8G + - id: root-b + type: root + size: 8G + - id: esp + type: esp + size: 1G + - id: trident + type: linux-generic + size: 1G + - id: disk2 + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-3 + partitionTableType: gpt + partitions: [] + abUpdate: + volumePairs: + - id: root + volumeAId: root-a + volumeBId: root-b + filesystems: + - deviceId: trident + source: new + mountPoint: /var/lib/trident + - deviceId: esp + mountPoint: + path: /boot/efi + options: umask=0077 + - deviceId: root + mountPoint: / +# AZL4 baseline scenario for the pytest E2E framework. Mirrors the AZL3 +# `base/` scenario as closely as possible while staying inside what PR-4's +# native hostname-carry-over fast path can serve. +# +# Why no `os:` section yet: +# The MOS install ISO (built from tests/images/trident-mos/iso.yaml) +# does not include /usr/bin/osmodifier. 
PR-5 bakes osmodifier into the +# target image so post-install Trident operations (update, runtime +# apply) can drive os.users / os.selinux / os.netplan via osmodifier, +# but the install-time validation runs in the MOS environment which +# currently lacks the binary. Until the MOS ISO is rebuilt with +# azurelinux-image-tools-osmodifier installed (a small follow-up), the +# install path must stick to PR-4's hostname-only fast path. +# +# Once the MOS includes osmodifier, this file can grow to mirror `base/` +# more completely (os.users, os.selinux, os.netplan). +# +# Other differences from base/: +# - No swap or /home partitions (kept simple for the first AZL4 scenario; +# swap support is its own follow-up and /home isn't load-bearing here). +# - No postConfigure sudo grant: the testing-user is added to wheel by +# the testimage MIC config and /etc/sudoers.d/wheel grants nopasswd. diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml new file mode 100644 index 0000000000..cbfa81bbe7 --- /dev/null +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml @@ -0,0 +1,3 @@ +compatible: + - rollback + - rollback-azl4 diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml new file mode 100644 index 0000000000..e833209068 --- /dev/null +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml @@ -0,0 +1,84 @@ +image: + url: http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi + sha384: ignored +storage: + disks: + - id: os + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-2 + partitionTableType: gpt + partitions: + - id: root-a + type: root + size: 8G + - id: root-b + type: root + size: 8G + - id: esp + type: esp + size: 1G + - id: trident + type: linux-generic + size: 1G + - id: disk2 + device: 
/dev/disk/by-path/pci-0000:00:1f.2-ata-3 + partitionTableType: gpt + partitions: [] + abUpdate: + volumePairs: + - id: root + volumeAId: root-a + volumeBId: root-b + filesystems: + - deviceId: trident + source: new + mountPoint: /var/lib/trident + - deviceId: esp + mountPoint: + path: /boot/efi + options: umask=0077 + - deviceId: root + mountPoint: / +os: + additionalFiles: + - destination: /var/lib/trident/local-health-check-file.sh + content: | + echo 'This is a local health check script.' + exit 0 +health: + checks: + - name: invoke-rollback-from-local-script + runOn: + - clean-install + path: /var/lib/trident/local-health-check-file.sh + - name: invoke-rollback-from-script + runOn: + - clean-install + content: | + exit 1 + - name: install-failure-systemd-check + runOn: + - clean-install + systemdServices: + - non-existent-service1.service + - non-existent-service2.service + timeoutSeconds: 15 +# AZL4 variant of the AZL3 `health-checks-install/` scenario. Adapted for the +# PR-4 hostname-only fast path: +# - Empty top-level `users`/`selinux`/`netplan` so install validation does +# not require the OS Modifier binary to be in the MOS install ISO (which +# does not currently include it; once the MOS rebuild lands, both this +# scenario and base-azl4 can grow os.users / os.selinux / os.netplan). +# - `os.additionalFiles` is the one os.* field used because health.checks +# references `path: /var/lib/trident/local-health-check-file.sh`, which +# needs to be on the target filesystem. additionalFiles is processed by +# Trident's storage / file-deploy paths, not by OS Modifier. +# +# Health-check failure expectations (asserted by tests/e2e_tests/rollback_test.py): +# - State transitions to `not-provisioned` (clean-install has no slot to +# roll back to; the install just fails). +# - `/var/lib/trident/trident-health-check-failure-*.log` is created. 
+# - The log contains: +# * `"Failed health check(s)"` +# * `"Script 'invoke-rollback-from-script' failed"` +# * `"Unit non-existent-service1.service could not be found"` +# * `"Unit non-existent-service2.service could not be found"` diff --git a/tests/images/trident-vm-testimage/README.md b/tests/images/trident-vm-testimage/README.md index e527ae04ee..7d4379ed5b 100644 --- a/tests/images/trident-vm-testimage/README.md +++ b/tests/images/trident-vm-testimage/README.md @@ -35,3 +35,49 @@ To build the update images, run: | ----------- | --------------------------------------- | ----------------------------------- | | Regular | `make trident-vm-grub-testimage` | `artifacts/trident-vm-grub-testimage/*` | | With verity | `make trident-vm-grub-verity-testimage` | `artifacts/trident-vm-grub-testimage/*` | + +## AZL4 variant (`trident-vm-grub-testimage-azl4`) + +A Fedora-derived (Azure Linux 4.0) variant lives alongside the AZL3 image +above. It uses `base/updateimg-grub-azl4.yaml` instead of +`base/updateimg-grub.yaml` and consumes `BaseImage.AZL4_QEMU_GUEST`. + +### Two extra prerequisites for AZL4 + +1. **AZL4 base VHDX.** No prebuilt AZL4 VHDX is available in the ADO + Artifacts feed yet, so build one locally with Image Customizer: + + ```bash + sudo imagecustomizer create \ + --config-file path/to/azl4-qemu-guest.yaml \ + --rpm-source path/to/azl4.repo \ + --tools-file path/to/azl4-tools.tar.gz \ + --build-dir /tmp/azl4-base-build \ + --output-image-file artifacts/azl4_qemu_guest.vhdx \ + --output-image-format vhdx \ + --distro azurelinux --distro-version 4.0 + ``` + + See `wiki/playbooks/trident-azl4-e2e-manual.md` in the karhu repo for + a ready-to-paste base config and the alpha2 repo URL. + + When an AZL4 VHDX lands in the ADO feed, add a `BaseImageManifest` + entry for `AZL4_QEMU_GUEST` in `testimages.py` so `cli download` + fetches it the same way it does the AZL3 bases. + +2. 
**Trident binary baked in.** The AZL4 image bakes + `/usr/bin/trident` via `additionalFiles` because there is no + `trident-service` RPM packaged for AZL4 yet. Drop the built binary + at `base/trident-bin/trident` before invoking the builder: + + ```bash + mkdir -p base/trident-bin + cp path/to/built/trident base/trident-bin/trident + chmod +x base/trident-bin/trident + ``` + + The binary should be built from a stack including the AZL4 enabling + branches: `azl4-1-grub-native` + `azl4-2-esp-layouts` + + `azl4-3-configure-bls` + `azl4-4-osconfig-hostname`. Once those land + on main, a plain main build suffices. The `base/trident-bin/` + directory is gitignored. diff --git a/tests/images/trident-vm-testimage/base/files/hostname-shim.sh b/tests/images/trident-vm-testimage/base/files/hostname-shim.sh new file mode 100644 index 0000000000..b12b3807c9 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/files/hostname-shim.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# AZL4 doesn't ship a `hostname` binary in `coreutils` (Fedora moved it to +# its own package which AZL4 hasn't picked up yet). The pytest E2E +# framework uses `hostname` as a smoke test of the SSH session in +# tests/e2e_tests/conftest.py, so without this shim every test errors out +# at fixture setup. +# +# Tiny POSIX-only replacement that reads /etc/hostname, plus a passthrough +# for `hostname -s` and `hostname -f` for completeness. +case "$1" in + -s|--short) + cat /etc/hostname | cut -d. 
-f1 + ;; + -f|--fqdn|"") + cat /etc/hostname + ;; + *) + cat /etc/hostname + ;; +esac diff --git a/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service b/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service new file mode 100644 index 0000000000..0fe938ddc3 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service @@ -0,0 +1,14 @@ +[Unit] +Description=Generate sshd host keys in /var/srv on first boot +ConditionPathExists=!/var/srv/etc/ssh/ssh_host_ed25519_key +Before=sshd.service +After=local-fs.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStartPre=/usr/bin/mkdir -p /var/srv/etc/ssh +ExecStart=/usr/bin/ssh-keygen -A -f /var/srv -q + +[Install] +WantedBy=multi-user.target diff --git a/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh b/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh new file mode 100755 index 0000000000..bdf901cd2e --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# regen-sshd-keys is a one-shot service that generates SSH host keys in +# /var/srv on first boot. Enable it via wants symlink because the generic +# `services.enable` in MIC config is reserved for systemd unit names that +# come from packages, and our unit is delivered via additionalFiles. +ln -sf /etc/systemd/system/regen-sshd-keys.service \ + /etc/systemd/system/multi-user.target.wants/regen-sshd-keys.service diff --git a/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh new file mode 100755 index 0000000000..ede3fdbaa2 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# AZL4-compatible variant of ssh-move-host-keys.sh. +# +# AZL3 sshd reads the main /etc/ssh/sshd_config and we appended HostKey +# lines to it. 
AZL4 sshd 10.0+ supports drop-ins under /etc/ssh/sshd_config.d/ +# which is the cleaner approach. +SSH_VAR_DIR="/var/srv/etc/ssh" +mkdir -p /etc/ssh/sshd_config.d +cat > /etc/ssh/sshd_config.d/50-trident-host-keys.conf < Date: Mon, 8 Jun 2026 13:21:02 -0700 Subject: [PATCH 20/42] fix: Remove stale osmodifier additionalFile from updateimg osmodifier is now a Rust crate built into the trident binary (PR #638). No separate osmodifier binary needs to be baked into test images. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/testimages.py | 8 ++------ .../trident-vm-testimage/base/updateimg-grub-azl4.yaml | 9 --------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/tests/images/testimages.py b/tests/images/testimages.py index b4d8b5416d..e71f9fda07 100755 --- a/tests/images/testimages.py +++ b/tests/images/testimages.py @@ -146,15 +146,11 @@ config_file="base/updateimg-grub-azl4.yaml", ssh_key="files/id_rsa.pub", # No trident-service RPM for AZL4 yet — the binary is delivered - # via additionalFiles. extra_dependencies enforces both binaries - # are in place before the image is built (osmodifier is delivered - # the same way until an AZL4 RPM exists; see - # tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml - # for the additionalFiles entries that consume both paths). + # via additionalFiles. extra_dependencies enforces it is in place + # before the image is built. 
requires_trident=False, extra_dependencies=[ Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), - Path("tests/images/trident-vm-testimage/base/osmodifier-bin/osmodifier"), ], ), ImageConfig( diff --git a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml index b03677b65a..9cee3c8096 100644 --- a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml @@ -106,15 +106,6 @@ os: - source: trident-bin/trident destination: /usr/bin/trident permissions: "755" - # Bake the OS Modifier binary into /usr/local/bin/osmodifier. - # AZL4 does not yet ship azurelinux-image-tools-osmodifier as an RPM, - # but Vince Perri's MIC AZL4 branch builds an AZL4-aware binary. - # Drop the build at osmodifier-bin/osmodifier alongside the trident - # binary; this ride-along disappears once the AZL4 RPM is published. - # See the comment above on /usr/local/bin placement. - - source: osmodifier-bin/osmodifier - destination: /usr/bin/osmodifier - permissions: "755" # AZL4 lacks a /usr/bin/hostname binary; the pytest framework smoke- # tests SSH with `hostname`, so we ship a tiny shim. - source: files/hostname-shim.sh From 46842598c1ee5834ae63c1967e9365b561f529a3 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:26:30 -0700 Subject: [PATCH 21/42] docs: Update stale AZL4 build comment in e2e-template Remove references to pinned-MIC and feed-published assumptions. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/templates/e2e-template.yml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/.pipelines/templates/e2e-template.yml b/.pipelines/templates/e2e-template.yml index 1dfd69fb7a..6322b480e9 100644 --- a/.pipelines/templates/e2e-template.yml +++ b/.pipelines/templates/e2e-template.yml @@ -224,17 +224,14 @@ stages: micVersion: ${{ parameters.micVersion }} dependsOnStage: ${{ parameters.baseImageArtifactStage }} - # Build the AZL4 test image (pinned-MIC path). + # Build the AZL4 test images. # - # TODO(azl4-release): Drop the bespoke build-image-azl4.yml call once AZL4 - # has feed-published base VHDXes, RPMs, and a released MIC container. - # Then this can be a plain build-image.yml call with an azureLinuxVersion - # parameter, matching the other testimage stages. + # Uses build-image-azl4.yml (MCR MIC + blob-sourced base VHDX) instead + # of the standard build-image.yml path. See build-image-azl4.yml for + # the merge-back TODO. # - # Gating mirrors the AzL installer ISO below so AZL4 build runs in every - # stage type that gates a trunk merge. Previously this only ran on - # pr-e2e / ci / pr-e2e-azure, which silently skipped AZL4 in - # azl-validation / full-validation — exactly the stage you'd want it. + # Gating mirrors the AzL installer ISO below so AZL4 builds run in + # every stage type that gates a trunk merge. 
- ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation'), eq(parameters.stageType, 'full-validation')) }}: - template: stages/build_image/build-image-azl4.yml parameters: From 58742b07841bf819291257b395fca5334935d2a4 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:32:16 -0700 Subject: [PATCH 22/42] docs: Simplify noprefix comment in base-azl4 config Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident_configurations/base-azl4/trident-config.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index a3aac68b95..9d2f5d8788 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -1,12 +1,8 @@ image: url: http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi sha384: ignored -# Note: AZL4 doesn't ship grub2-efi-binary-noprefix. We deliberately do -# not set `internalParams.disableGrubNoprefixCheck` here — trident -# auto-detects AZL4 (via `is_azl4_or_later` in -# crates/trident/src/subsystems/esp.rs) and skips the check itself, so -# this scenario exercises the auto-detection path that real customers -# will hit. +# AZL4 does not ship grub2-efi-binary-noprefix. Trident handles this +# automatically — no disableGrubNoprefixCheck override needed. storage: disks: - id: os From b450fca69899f8f4368f16da352226213a7c21d5 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:33:34 -0700 Subject: [PATCH 23/42] docs: Update stale base-azl4 config comment Remove references to dropped PRs, external osmodifier binary, and MOS ISO limitations that no longer apply. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../base-azl4/trident-config.yaml | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index 9d2f5d8788..11bc1bb095 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -41,24 +41,12 @@ storage: - deviceId: root mountPoint: / # AZL4 baseline scenario for the pytest E2E framework. Mirrors the AZL3 -# `base/` scenario as closely as possible while staying inside what PR-4's -# native hostname-carry-over fast path can serve. +# `base/` scenario with these differences: # -# Why no `os:` section yet: -# The MOS install ISO (built from tests/images/trident-mos/iso.yaml) -# does not include /usr/bin/osmodifier. PR-5 bakes osmodifier into the -# target image so post-install Trident operations (update, runtime -# apply) can drive os.users / os.selinux / os.netplan via osmodifier, -# but the install-time validation runs in the MOS environment which -# currently lacks the binary. Until the MOS ISO is rebuilt with -# azurelinux-image-tools-osmodifier installed (a small follow-up), the -# install path must stick to PR-4's hostname-only fast path. -# -# Once the MOS includes osmodifier, this file can grow to mirror `base/` -# more completely (os.users, os.selinux, os.netplan). -# -# Other differences from base/: -# - No swap or /home partitions (kept simple for the first AZL4 scenario; -# swap support is its own follow-up and /home isn't load-bearing here). -# - No postConfigure sudo grant: the testing-user is added to wheel by -# the testimage MIC config and /etc/sudoers.d/wheel grants nopasswd. +# - No `os:` section: users, SSH keys, networking, and sudoers are +# baked into the test image at MIC build time. 
Can grow to mirror +# `base/` more completely as AZL4 testing matures. +# - No swap or /home partitions (kept simple for the first AZL4 +# scenario). +# - No postConfigure sudo grant: testing-user is added to wheel by +# the testimage MIC config. From d8a19c1cd97e6c0c506549a4f28143134c6cf91f Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:36:34 -0700 Subject: [PATCH 24/42] docs: Remove PR reference from service connection comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../templates/stages/build_image/build-image-template-azl4.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml index 7b679b0848..db9dda989f 100644 --- a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -50,8 +50,7 @@ parameters: - name: blobServiceConnection type: string # NB: this must be a service connection that exists in the ADO project. - # Trident infra needs to create it manually (Karhu can't); see the PR-5 - # follow-up validation report for the runbook. + # Created manually by trident infra team. default: "trident-azl4-blob-reader" - name: micContainerTag From a03524a05d5eada3795df18735bd8b7d831c2c46 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 14:26:39 -0700 Subject: [PATCH 25/42] tests: Align base-azl4 trident config with AZL3 base Add os.users, os.selinux, os.netplan, swap, /home partitions, and postConfigure sudo grant to match the AZL3 base scenario. Now that osmodifier is built into the trident binary, these features should be tested on AZL4 the same way they are on AZL3. Uses testing-user (matching AZL4 test image and pytest USERNAME). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../base-azl4/trident-config.yaml | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index 11bc1bb095..2192ac99e6 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -18,6 +18,12 @@ storage: - id: esp type: esp size: 1G + - id: swap + type: swap + size: 2G + - id: home + type: home + size: 1G - id: trident type: linux-generic size: 1G @@ -34,19 +40,35 @@ storage: - deviceId: trident source: new mountPoint: /var/lib/trident + - deviceId: home + source: new + mountPoint: /home - deviceId: esp mountPoint: path: /boot/efi options: umask=0077 - deviceId: root mountPoint: / -# AZL4 baseline scenario for the pytest E2E framework. Mirrors the AZL3 -# `base/` scenario with these differences: -# -# - No `os:` section: users, SSH keys, networking, and sudoers are -# baked into the test image at MIC build time. Can grow to mirror -# `base/` more completely as AZL4 testing matures. -# - No swap or /home partitions (kept simple for the first AZL4 -# scenario). -# - No postConfigure sudo grant: testing-user is added to wheel by -# the testimage MIC config. 
+ swap: + - swap +scripts: + postConfigure: + - name: testing-privilege + runOn: + - clean-install + - ab-update + content: echo 'testing-user ALL=(ALL:ALL) NOPASSWD:ALL' > /etc/sudoers.d/testing-user +os: + selinux: + mode: enforcing + netplan: + version: 2 + ethernets: + vmeths: + match: + name: enp* + dhcp4: true + users: + - name: testing-user + sshPublicKeys: [] + sshMode: key-only From 6dbf7da681d24ad5f043e1665d9f98e554798e66 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 21:27:59 -0700 Subject: [PATCH 26/42] tests: Set AZL4 selinux to disabled to validate timeout theory AZL4 Beta may not have complete SELinux policies. Testing whether enforcing mode prevents services from starting after reboot. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident_configurations/base-azl4/trident-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index 2192ac99e6..516c5f0a7f 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -60,7 +60,7 @@ scripts: content: echo 'testing-user ALL=(ALL:ALL) NOPASSWD:ALL' > /etc/sudoers.d/testing-user os: selinux: - mode: enforcing + mode: disabled netplan: version: 2 ethernets: From caba721bf264168b1db6860aa94a317b25b6237e Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 9 Jun 2026 15:36:24 -0700 Subject: [PATCH 27/42] tests: Remove netplan from AZL4 config to isolate failure Testing whether netplan (match: enp*) conflicts with the image's eth0 networking (net.ifnames=0) and prevents network after reboot. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident_configurations/base-azl4/trident-config.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index 516c5f0a7f..e61f98e23c 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -61,13 +61,6 @@ scripts: os: selinux: mode: disabled - netplan: - version: 2 - ethernets: - vmeths: - match: - name: enp* - dhcp4: true users: - name: testing-user sshPublicKeys: [] From c6d77d3c0dd0e02c14e46428957682cbeaf19423 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 9 Jun 2026 16:28:11 -0700 Subject: [PATCH 28/42] tests: Revert to exact passing config to establish baseline Strip back to the config that passed in build 1133385 to confirm the netlaunch timeout is caused by our additions, not an infra change. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../base-azl4/trident-config.yaml | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index e61f98e23c..9b1797d4ce 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -18,12 +18,6 @@ storage: - id: esp type: esp size: 1G - - id: swap - type: swap - size: 2G - - id: home - type: home - size: 1G - id: trident type: linux-generic size: 1G @@ -40,28 +34,9 @@ storage: - deviceId: trident source: new mountPoint: /var/lib/trident - - deviceId: home - source: new - mountPoint: /home - deviceId: esp mountPoint: path: /boot/efi options: umask=0077 - deviceId: root - mountPoint: / - swap: - - swap -scripts: - postConfigure: - - name: testing-privilege - runOn: - - clean-install - - ab-update - content: echo 'testing-user ALL=(ALL:ALL) NOPASSWD:ALL' > /etc/sudoers.d/testing-user -os: - selinux: - mode: disabled - users: - - name: testing-user - sshPublicKeys: [] - sshMode: key-only + mountPoint: / \ No newline at end of file From 0a454b0fd42b0cb295ee9b603d1345bfdeb4fef7 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 9 Jun 2026 17:20:09 -0700 Subject: [PATCH 29/42] tests: Add only os.users to isolate netlaunch failure Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident_configurations/base-azl4/trident-config.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index 9b1797d4ce..b95ef49c37 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ 
b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -39,4 +39,9 @@ storage: path: /boot/efi options: umask=0077 - deviceId: root - mountPoint: / \ No newline at end of file + mountPoint: / +os: + users: + - name: testing-user + sshPublicKeys: [] + sshMode: key-only \ No newline at end of file From a90f38f89fed5c332457beaec77deeeba245132f Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 9 Jun 2026 18:01:50 -0700 Subject: [PATCH 30/42] tests: Add swap + /home + os.users to isolate failure os.users alone passed. Now testing swap + /home partitions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../base-azl4/trident-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index b95ef49c37..c3ba614bee 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -18,6 +18,12 @@ storage: - id: esp type: esp size: 1G + - id: swap + type: swap + size: 2G + - id: home + type: home + size: 1G - id: trident type: linux-generic size: 1G @@ -34,12 +40,17 @@ storage: - deviceId: trident source: new mountPoint: /var/lib/trident + - deviceId: home + source: new + mountPoint: /home - deviceId: esp mountPoint: path: /boot/efi options: umask=0077 - deviceId: root mountPoint: / + swap: + - swap os: users: - name: testing-user From 6412f16a7fabe1cdb289f550d53ff732999fcf81 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 9 Jun 2026 18:43:45 -0700 Subject: [PATCH 31/42] tests: Add swap only (no /home) to isolate partition failure Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident_configurations/base-azl4/trident-config.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index c3ba614bee..846a6c1062 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -21,9 +21,6 @@ storage: - id: swap type: swap size: 2G - - id: home - type: home - size: 1G - id: trident type: linux-generic size: 1G @@ -40,9 +37,6 @@ storage: - deviceId: trident source: new mountPoint: /var/lib/trident - - deviceId: home - source: new - mountPoint: /home - deviceId: esp mountPoint: path: /boot/efi From ccc10478f4f1257622ee66470a9fbbc354159886 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 10 Jun 2026 16:31:23 -0700 Subject: [PATCH 32/42] fix: Use testuser in AZL4 COSI to match AZL3 pattern The COSI image user (MIC) must differ from the trident config user (os.users) to avoid /home mount conflict. AZL3 uses testuser in the COSI and testing-user in the trident config. AZL4 was using testing-user in both, causing 'Mount path /mnt/newroot/home is not empty' during install. Also restore full test config (swap, /home, os.users, os.selinux, os.netplan) and fix netplan match from enp* to eth* (AZL4 uses net.ifnames=0). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../base-azl4/trident-config.yaml | 22 +++++++++++++++++++ .../base/updateimg-grub-azl4.yaml | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index 846a6c1062..bfa96396b9 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -21,6 +21,9 @@ storage: - id: swap type: swap size: 2G + - id: home + type: home + size: 1G - id: trident type: linux-generic size: 1G @@ -37,6 +40,9 @@ storage: - deviceId: trident source: new mountPoint: /var/lib/trident + - deviceId: home + source: new + mountPoint: /home - deviceId: esp mountPoint: path: /boot/efi @@ -45,7 +51,23 @@ storage: mountPoint: / swap: - swap +scripts: + postConfigure: + - name: testing-privilege + runOn: + - clean-install + - ab-update + content: echo 'testing-user ALL=(ALL:ALL) NOPASSWD:ALL' > /etc/sudoers.d/testing-user os: + selinux: + mode: disabled + netplan: + version: 2 + ethernets: + vmeths: + match: + name: eth* + dhcp4: true users: - name: testing-user sshPublicKeys: [] diff --git a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml index 9cee3c8096..e12db0ad7b 100644 --- a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml @@ -82,7 +82,7 @@ os: # belt-and-braces creates the multi-user.target.wants symlink. 
- trident.service users: - - name: testing-user + - name: testuser sshPublicKeyPaths: - files/id_rsa.pub secondaryGroups: From 107dffd79f54f101d6b4d1260f7dd8f4660a04c4 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 10 Jun 2026 22:27:44 -0700 Subject: [PATCH 33/42] tests: Reduce AZL4 COSI ESP from 64M to 16M COSI ESP only stores one set of boot files (~7MB). 64M was unnecessarily large. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml index e12db0ad7b..89c78ba800 100644 --- a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml @@ -5,7 +5,7 @@ storage: partitionTableType: gpt partitions: - id: esp - size: 64M + size: 16M type: esp - id: root size: 4G From e29e0cce827166aa438147fce975a7a7c637ae3b Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 10 Jun 2026 22:32:19 -0700 Subject: [PATCH 34/42] tests: Remove /home partition from AZL4 host-mode config The COSI bakes /home/testuser onto root via MIC os.users. Trident's newroot mount rejects non-empty mount points, so a separate /home partition conflicts. AZL3 avoids this by only testing /home in container mode. Container mode for AZL4 is a follow-up. Keep swap, os.users, os.selinux, os.netplan, postConfigure. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../base-azl4/trident-config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml index bfa96396b9..c918e15151 100644 --- a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -21,9 +21,6 @@ storage: - id: swap type: swap size: 2G - - id: home - type: home - size: 1G - id: trident type: linux-generic size: 1G @@ -40,9 +37,6 @@ storage: - deviceId: trident source: new mountPoint: /var/lib/trident - - deviceId: home - source: new - mountPoint: /home - deviceId: esp mountPoint: path: /boot/efi @@ -51,6 +45,12 @@ storage: mountPoint: / swap: - swap +# /home partition omitted: the COSI bakes a user home directory onto +# root via MIC os.users. Trident's newroot mount rejects non-empty +# mount points, so a separate /home partition conflicts with the +# pre-existing /home/. AZL3 avoids this by only testing /home +# in container mode. Container mode support for AZL4 is tracked as +# a follow-up. scripts: postConfigure: - name: testing-privilege From 2d770b140fdc81d64c6ea24a8915363608d7a5ef Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:21:02 -0700 Subject: [PATCH 35/42] tests: Add AZL4 BM-simulated netlaunch test stage Adds AZL4 bare-metal simulated netlaunch pipeline stage and SELinux xattr stripping script for test image prep. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../testing_vm/netlaunch-testing-azl4.yml | 364 ++++++++++++++++++ .../base/scripts/strip-selinux-xattrs.sh | 85 ++++ 2 files changed, 449 insertions(+) create mode 100644 .pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml create mode 100644 tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh diff --git a/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml new file mode 100644 index 0000000000..03dd6cab9c --- /dev/null +++ b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml @@ -0,0 +1,364 @@ +# AZL4 BM-simulated netlaunch test stage. +# +# Drives an AZL3 MOS installer ISO + AZL4 COSI through netlaunch to validate +# that the AZL4 COSI (from PR-5) can be installed by Trident onto a fresh +# virtdeploy VM. This is the BM-simulated path: Trident runs from the live +# MOS environment (AZL3), `trident install` partitions the disk and streams +# the AZL4 COSI to it, the target boots into AZL4. +# +# Differences from netlaunch-testing.yml: +# * No test matrix. Hardcoded to the `base-azl4` configuration in +# tests/e2e_tests/trident_configurations/. +# * Host runtimeEnv only. Container variant is a follow-on. +# * No ACR push. The AZL4 COSI is served locally by netlaunch. +# * No SELinux check. AZL4 SELinux integration is its own follow-on. +# * No matrix-driven test execution after install. First iteration only +# validates that the VM provisions and is reachable over SSH. +# +# TODO(azl4-merge-back): Once AZL4 has a published trident-service RPM and +# all the bits below (SELinux, container path, metrics) are wired up for +# AZL4, fold this back into netlaunch-testing.yml as an additional matrix +# entry. + +parameters: + - name: installerISOArtifact + type: string + # AZL3 MOS ISO is the live OS Trident runs from. 
It does not need to + # match the target OS version since the target comes from the COSI. + default: "trident-installer" + + - name: cosiArtifact + type: string + # Artifact published by stages/build_image/build-image-azl4.yml. The + # actual COSI file inside is trident-vm-grub-testimage-azl4.cosi. + default: "trident-vm-grub-testimage-azl4" + + - name: tridentConfiguration + type: string + # Lives at tests/e2e_tests/trident_configurations/base-azl4/. + default: "base-azl4" + + - name: dependsOnStage + type: string + default: "" + +stages: + - stage: NetlaunchTesting_AZL4 + displayName: Netlaunch Testing - AZL4 (BM-simulated) + dependsOn: + - BuildingTools + - PrepareSSHKeys + - TridentTestImg_trident_installer + - TridentTestImg_trident_vm_grub_testimage_azl4 + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + jobs: + - job: NetlaunchAzl4 + displayName: Netlaunch (AZL3 ISO + AZL4 COSI) + timeoutInMinutes: 30 + pool: + type: linux + name: trident-ubuntu-1es-pool-eastus2 + hostArchitecture: amd64 + + variables: + - name: ob_outputDirectory + value: /tmp/deployment_logs_azl4 + - name: ob_artifactBaseName + value: "netlaunch-testing-azl4" + + - name: tridentConfigPath + value: tests/e2e_tests/trident_configurations/${{ parameters.tridentConfiguration }} + + - name: netlaunchPort + value: 4001 + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL3 installer ISO" + inputs: + buildType: current + artifactName: "${{ parameters.installerISOArtifact }}" + targetPath: "$(TRIDENT_SOURCE_DIR)/artifacts/iso" + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 COSI" + inputs: + buildType: current + artifactName: "${{ parameters.cosiArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/cosi-azl4" + + # PrepareSSHKeys produces the shared 'ssh-keys' artifact whose + # id_rsa.pub is baked into the AZL4 COSI at MIC build time (see + # 
+ build-image-template-azl4.yml). The matching private key + # `ssh-keys/id_rsa` is what we use locally to SSH into the + # post-install AZL4 VM. Until 2026-05-17 we generated a fresh + # per-build keypair inside testimages.py and published the + # private half alongside the COSI, but the qcow2 + cosi builds + # for VM-testing need to share a key (the same VM A/B-updates + # from qcow2 to cosi), so we standardized on the shared artifact. + - task: DownloadPipelineArtifact@2 + displayName: "Download shared SSH key" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh-keys" + + - task: DownloadPipelineArtifact@2 + displayName: "Download go-tools" + inputs: + buildType: current + artifactName: "go-tools" + patterns: | + netlaunch + netlisten + storm-trident + virtdeploy + targetPath: "$(TRIDENT_SOURCE_DIR)/bin" + + # Install libvirt / qemu / OVMF and configure libvirt access. Without + # this, virt-deploy fails creating bridge interfaces ("Operation not + # permitted") on the OneBranch Ubuntu runner. + - template: netlaunch-prep.yml + + # NOTE: we intentionally do NOT run testing_common/trident-prep.yml. + # That template runs edit_host_config.py, which injects the test + # SSH key into trident-config's os.users section. The AZL4 + # `base-azl4` trident-config omits the os: section entirely + # because the AZL3 MOS installer ISO has no /usr/bin/osmodifier, + # so trident can't drive os.users at install time. Instead we + # use the shared `ssh-keys` key whose public half was baked into + # the AZL4 COSI at MIC time (set up below). + + - bash: | + set -euxo pipefail + + chmod +x "$(TRIDENT_SOURCE_DIR)"/bin/{netlaunch,netlisten,storm-trident,virtdeploy} + + # Stage the AZL4 COSI as regular.cosi where netlaunch will + # serve it. The trident-config for base-azl4 references + # http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi.
+ SERVE_DIR="$(TRIDENT_SOURCE_DIR)/artifacts/test-image" + mkdir -p "$SERVE_DIR" + + # The artifact may contain the file with the imageName as + # prefix; tolerate both layouts. The clone-index suffix + # (`_0.cosi`) is what testimages.py produces when called + # with the default --clones >= 1. + COSI_SRC="" + for candidate in \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/trident-vm-grub-testimage-azl4_0.cosi" \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/trident-vm-grub-testimage-azl4.cosi" \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/regular.cosi"; do + if [ -f "$candidate" ]; then + COSI_SRC="$candidate" + break + fi + done + + if [ -z "$COSI_SRC" ]; then + echo "Could not find AZL4 COSI. Artifact contents:" + find "$(Build.ArtifactStagingDirectory)/cosi-azl4" -type f | head -20 + exit 1 + fi + + cp "$COSI_SRC" "$SERVE_DIR/regular.cosi" + ls -alh "$SERVE_DIR" + + # Install the shared SSH private key (from the + # PrepareSSHKeys artifact) as the test framework's + # helpers/key. Its matching public key was baked into the + # AZL4 COSI at MIC build time, so post-install we can SSH + # into the target as testing-user with this key. + KEY_SRC="$(Build.ArtifactStagingDirectory)/ssh-keys/id_rsa" + if [ ! -f "$KEY_SRC" ]; then + echo "Could not find shared SSH key at $KEY_SRC. Artifact contents:" + find "$(Build.ArtifactStagingDirectory)/ssh-keys" -type f + exit 1 + fi + cp "$KEY_SRC" "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + chmod 600 "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + # Convert to PEM if not already (matches what trident-prep + # does for AZL3 keys). + # Convert the shared SSH key to PEM if it isn't already. + # `ssh-keygen -p -P "" -N "" -m PEM -f ...` is a no-op on + # already-PEM keys and explicitly tells ssh-keygen that + # the existing passphrase is empty (so it doesn't read + # stdin if it can't guess).
+ ssh-keygen -p -P "" -N "" -m PEM -f "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + ls -alh "$(TRIDENT_SOURCE_DIR)/artifacts/iso" + ls -alh "$(TRIDENT_SOURCE_DIR)/bin" + displayName: "Stage AZL4 COSI as regular.cosi" + + - bash: | + set -eux + # Disable virtlogd rollover so we keep full logs. + echo "max_size = 0" | sudo tee -a /etc/libvirt/virtlogd.conf + sudo systemctl restart virtlogd.socket + + ./tools/virt-deploy create --mem 12 --disks 32,32 + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "Create virt-deploy VM" + + - bash: | + set -euxo pipefail + + TRIDENT_CONFIG="$(TRIDENT_SOURCE_DIR)/$(tridentConfigPath)/trident-config.yaml" + + # Run netlaunch in the background so we can monitor its log + # for the install-success marker. The base-azl4 cosi does + # not yet ship trident systemd units (trident.service, + # tridentd.socket) so the installed AZL4 system never + # phones home post-reboot. netlaunch's ListenLoop always + # waits for at least one phone-home regardless of flags, + # so we treat trident's own "Rebooting system" log line + # (emitted by crates/trident/src/reboot.rs from the install + # success path) as our completion signal and terminate + # netlaunch cleanly. Phone-home wiring belongs with the + # VM-testing work where the trident systemd stack lands. + # + # netlaunch is launched with `setsid` so we can signal the + # whole process group on shutdown — otherwise the child + # HTTP/TFTP server processes get reparented to PID 1 and + # may leak ports / qcow2 file locks to the next job on the + # same agent. + setsid ./bin/netlaunch \ + --iso ./artifacts/iso/${{ parameters.installerISOArtifact }}.iso \ + --config $(TRIDENT_SOURCE_DIR)/tools/vm-netlaunch.yaml \ + --trident "$TRIDENT_CONFIG" \ + --servefolder ./artifacts/test-image \ + --logstream \ + --force-color \ + --full-logstream logstream-full.log \ + --only-print-exit-code \ + --port $(netlaunchPort) > ./clean-install-azl4.log 2>&1 & + NETLAUNCH_PID=$! 
+ NETLAUNCH_PGID="$NETLAUNCH_PID" + echo "netlaunch pid: $NETLAUNCH_PID (pgid $NETLAUNCH_PGID)" + + # Watch for the install-success marker for up to 12 minutes. + # Real install completes in 1-3 minutes once netlaunch + # finishes booting the MOS ISO via HTTP boot. UEFI HTTP + # boot can occasionally need 5+ minutes, so 12 minutes is + # generous. + # + # The marker regex is tightened to trident's own + # log-record prefix to avoid false-positives from any + # kernel / systemd / dracut "Restarting system" line that + # might fire on an error path before trident itself + # actually completes. + REBOOT_RE='trident[^[:space:]]*[[:space:]]+(INFO|WARN)[[:space:]].*Rebooting system' + FATAL_RE='kernel panic|dracut:.*FATAL|Emergency mode|emergency!' + DEADLINE=$((SECONDS + 720)) + INSTALL_OK=0 + while [ $SECONDS -lt $DEADLINE ]; do + if grep -Eq "$FATAL_RE" ./clean-install-azl4.log 2>/dev/null; then + echo "FATAL marker observed before install success — aborting" + break + fi + if ! kill -0 $NETLAUNCH_PID 2>/dev/null; then + echo "netlaunch exited on its own" + if wait $NETLAUNCH_PID; then + INSTALL_OK=1 + fi + break + fi + if grep -Eq "$REBOOT_RE" ./clean-install-azl4.log 2>/dev/null; then + echo "install completed (saw trident 'Rebooting system' marker)" + INSTALL_OK=1 + break + fi + sleep 10 + done + + # Always show the netlaunch log tail for diagnostics + echo "--- netlaunch log tail ---" + tail -50 ./clean-install-azl4.log || true + + if [ $INSTALL_OK -eq 1 ]; then + echo "Killing netlaunch process group (install completed; not waiting for phone-home)" + # SIGTERM the whole group; netlaunch's children include + # an HTTP/TFTP server we need to release the port on. + kill -TERM -"$NETLAUNCH_PGID" 2>/dev/null || true + # Generous grace so --full-logstream finishes flushing + # to logstream-full.log (which the failure-diagnostic + # display-logs step uploads). + for _ in 1 2 3 4 5 6 7 8 9 10; do + if ! 
kill -0 $NETLAUNCH_PID 2>/dev/null; then + break + fi + sleep 1 + done + kill -KILL -"$NETLAUNCH_PGID" 2>/dev/null || true + wait $NETLAUNCH_PID 2>/dev/null || true + exit 0 + fi + + echo "Install marker not observed within timeout (or fatal seen)" + kill -TERM -"$NETLAUNCH_PGID" 2>/dev/null || true + sleep 5 + kill -KILL -"$NETLAUNCH_PGID" 2>/dev/null || true + exit 1 + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "🚀 Run netlaunch (AZL3 ISO installs AZL4 COSI)" + # 14 minutes covers the 12-minute install-success watcher + # plus a couple minutes of slack. + timeoutInMinutes: 14 + + - bash: | + set -eux + sudo ./bin/storm-trident helper wait-for-login -a \ + --vm-name "$(jq -r '.virtualmachines[0].name' $(TRIDENT_SOURCE_DIR)/tools/virt-deploy-metadata.json)" \ + --artifacts-folder "$(ob_outputDirectory)" + timeoutInMinutes: 5 + # `succeeded()` (not `succeededOrFailed()`) so a failed + # SSH-up after a "successful" netlaunch actually fails the + # stage. Combined with the tightened install marker above, + # this closes the structural bias-toward-green where the + # netlaunch wrapper could exit 0 on a false-positive log + # line and let everything downstream gloss over the failure. 
+ condition: succeeded() + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "📄 Wait for target OS to be reachable" + + - bash: | + set -eux + ./bin/storm-trident script capture-screenshot \ + --screenshot-filename "install-azl4.png" \ + --artifacts-folder "$(ob_outputDirectory)" + displayName: "📷 Capture screenshot" + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: succeededOrFailed() + + - bash: | + set -eux + sudo ./bin/storm-trident helper display-logs -a \ + --serial-log-artifact-file-name "azl4-install-target-os-A-serial.log" \ + --trident-trace-log-file "$(TRIDENT_SOURCE_DIR)/logstream-full.log" \ + --artifacts-folder "$(ob_outputDirectory)" + displayName: "📄 Display install logs" + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: succeededOrFailed() + + - bash: | + set -eux + sudo virsh shutdown virtdeploy-vm-0 || true + mkdir -p $(ob_outputDirectory) + sudo cp /var/lib/libvirt/images/virtdeploy-pool/virtdeploy-vm-0-0-volume.qcow2 $(ob_outputDirectory)/ || true + sudo zstd -T0 $(ob_outputDirectory)/virtdeploy-vm-0-0-volume.qcow2 || true + sudo cp $(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key $(ob_outputDirectory) || true + # Owner-only readable. Previously this was `chmod 777` + # which produced a SARIF-flaggable artifact even though + # the key is per-build ephemeral. 
+ sudo chmod 600 $(ob_outputDirectory)/key || true + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: failed() + displayName: "Publish OS disk on failure" + + - template: ../testing_common/fix-output-directory-for-one-branch-step.yml + parameters: + outputDir: $(ob_outputDirectory) + condition: always() diff --git a/tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh b/tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh new file mode 100644 index 0000000000..aaa8f38440 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Strip security.selinux xattrs from every file in the cosi. +# +# Background: AZL4's base VHDX is built by the upstream Azure Linux build +# process with SELinux file labels baked in (e.g. system_u:object_r:lib_t:s0). +# Even though this test image sets `selinux: mode: disabled`, MIC does not +# strip the inherited xattrs — `mode: disabled` only controls boot-time +# SELinux state. +# +# These labels become a problem when Trident installs this cosi from the +# AZL3 MOS environment: MOS boots with selinux=1 enforcing=0, loads its +# AZL3 policy, and dracut (running inside the chroot) tries to preserve +# the AZL4 labels via cp -a. The MOS-side SELinux LSM validates the +# context being written and rejects labels not in its policy. dracut +# cascades through hundreds of "cp: setting attribute 'security.selinux': +# Permission denied" errors, eventually fatally on dracut-install's ldd +# step. +# +# Stripping the xattrs at cosi build time sidesteps this entirely: +# - During MIC build, SELinux is not loaded inside the chroot, so +# setfattr -x works without policy interference. +# - During Trident install in MOS, cp -a finds no security.selinux to +# preserve and skips the setxattr call. +# - On first boot of the installed AZL4 OS, files get auto-relabeled if +# SELinux is enabled (which our test config disables anyway). 
+# +# Once AZL4 is the install/target environment for everything (no AZL3 MOS +# bridging it), this script can be removed. + +set -euo pipefail + +echo "Stripping security.selinux xattrs from rootfs..." + +# Walk every regular file, symlink, and directory across all filesystems +# under /. `find -xdev` would skip separately-mounted filesystems like +# `/boot` and `/var` that MIC commonly composes with — and `/boot` +# specifically carries SELinux labels on the kernel image and initramfs, +# which is exactly what dracut touches during AZL3 MOS install of the +# AZL4 cosi. So we walk the whole tree and only prune the virtual +# filesystems where xattrs don't make sense (`/proc`, `/sys`, `/dev`, +# `/run`). +# +# `setfattr` follows symlinks by default; `-h` makes it operate on the +# symlink itself, which is what we want here. +count=0 +fail_count=0 +while IFS= read -r -d '' f; do + # Capture stderr so we can distinguish benign outcomes (ENODATA "no such + # attribute" or EOPNOTSUPP — nothing to strip) from real failures (EPERM). + err=$(setfattr -h -x security.selinux "$f" 2>&1 >/dev/null) || rc=$? && rc=${rc:-0} + if [ "$rc" -eq 0 ]; then + count=$((count + 1)) + elif echo "$err" | grep -qE "No such attribute|Operation not supported"; then + : # nothing to strip, expected for files without the xattr + else + fail_count=$((fail_count + 1)) + echo "setfattr failed on '$f': $err" >&2 + fi + rc=0 +done < <(find / \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune \ + -o \( -type f -o -type d -o -type l \) -print0) + +echo "Stripped security.selinux from ${count} files/dirs" + +if [ "$fail_count" -gt 0 ]; then + echo "ERROR: setfattr failed (non-ENODATA) on ${fail_count} entries" >&2 + exit 1 +fi + +# Verify the strip actually took effect by scanning a representative set +# of paths (/etc, /usr/lib/systemd, /usr/bin, and /boot if present).
Any +# residual security.selinux means we missed something — fail loudly +# rather than warning, since the whole point of the script is to leave +# the image bare. +sentinel_dirs=( "/etc" "/usr/lib/systemd" "/usr/bin" ) +if [ -d /boot ]; then + sentinel_dirs+=( "/boot" ) +fi +for d in "${sentinel_dirs[@]}"; do + if getfattr -R -m security.selinux "$d" 2>/dev/null | grep -q security.selinux; then + echo "ERROR: security.selinux xattr still present under '$d'" >&2 + getfattr -R -m security.selinux "$d" 2>/dev/null | head -10 >&2 + exit 1 + fi +done From 53020d5c62e07d175a44a193e6169c2bc0cefb54 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:37:30 -0700 Subject: [PATCH 36/42] docs: Remove PR reference from netlaunch comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../stages/testing_vm/netlaunch-testing-azl4.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml index 03dd6cab9c..77cd57803c 100644 --- a/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml +++ b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml @@ -1,10 +1,10 @@ # AZL4 BM-simulated netlaunch test stage. # # Drives an AZL3 MOS installer ISO + AZL4 COSI through netlaunch to validate -# that the AZL4 COSI (from PR-5) can be installed by Trident onto a fresh -# virtdeploy VM. This is the BM-simulated path: Trident runs from the live -# MOS environment (AZL3), `trident install` partitions the disk and streams -# the AZL4 COSI to it, the target boots into AZL4. +# that the AZL4 COSI can be installed by Trident onto a fresh virtdeploy VM. +# Trident runs from the live MOS environment (AZL3), `trident install` +# partitions the disk and streams the AZL4 COSI to it, the target boots +# into AZL4. # # Differences from netlaunch-testing.yml: # * No test matrix. 
Hardcoded to the `base-azl4` configuration in From 3dad740a59f24d51f0fbec8e8b59777ea940b90c Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:44 -0700 Subject: [PATCH 37/42] engineering: Add AZL4 qcow2 base image, offline-init, sfdisk hardening Adds sfdisk partition-table helper, extended offline-init for AZL4 qcow2 images, base image COSI config, and test helper scripts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/sfdisk.rs | 55 +++++ crates/trident/src/init/offline/mod.rs | 146 +++++++++++-- tests/e2e_tests/base_test.py | 3 + .../base/baseimg-grub-azl4.yaml | 195 ++++++++++++++++++ .../scripts/enable-trident-service-azl4.sh | 35 ++++ .../base/scripts/rebuild-initrd-azl4.sh | 62 ++++++ .../base/scripts/update-host-status-azl4.sh | 16 ++ 7 files changed, 498 insertions(+), 14 deletions(-) create mode 100644 tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml create mode 100644 tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh create mode 100644 tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh create mode 100644 tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh diff --git a/crates/osutils/src/sfdisk.rs b/crates/osutils/src/sfdisk.rs index 81eef21c71..f40276ad95 100644 --- a/crates/osutils/src/sfdisk.rs +++ b/crates/osutils/src/sfdisk.rs @@ -197,6 +197,61 @@ pub fn get_disk_uuid(disk: &Path) -> Result, Error> { Ok(Some(uuid)) } +/// Sets the disk-id (GPT header DiskGUID) of the given disk via sfdisk. +/// +/// `uuid` must parse as a valid GUID; this is checked before invoking +/// sfdisk so an accidental flag-like string (e.g. `--foo`) is rejected +/// here rather than mis-interpreted by sfdisk as an option. +/// +/// `--no-reread` + `--no-tell-kernel` are passed because the typical +/// caller is `trident offline-initialize` inside MIC's chroot, where +/// the disk's partitions are bind-mounted into the chroot. 
Requesting +/// `BLKRRPART` on a disk with mounted partitions returns EBUSY; we +/// only care about updating the on-disk GPT here. +pub fn set_disk_uuid(disk: &Path, uuid: &str) -> Result<(), Error> { + uuid::Uuid::parse_str(uuid) + .with_context(|| format!("'{uuid}' is not a valid GUID for sfdisk --disk-id"))?; + Dependency::Sfdisk + .cmd() + .arg("--no-reread") + .arg("--no-tell-kernel") + .arg("--disk-id") + .arg(disk) + .arg(uuid) + .run_and_check() + .context(format!( + "Failed to set disk-id on {} to {uuid}", + disk.display() + ))?; + Ok(()) +} + +/// Sets the GPT partition UUID for a specific partition by number on the +/// given disk. +/// +/// `uuid` is validated as a GUID first to avoid sfdisk mis-interpreting +/// a flag-like argument. `--no-reread` / `--no-tell-kernel` mirror +/// [`set_disk_uuid`] for safety inside MIC chroots with mounted +/// partitions. +pub fn set_part_uuid(disk: &Path, partition_number: usize, uuid: &str) -> Result<(), Error> { + uuid::Uuid::parse_str(uuid) + .with_context(|| format!("'{uuid}' is not a valid GUID for sfdisk --part-uuid"))?; + Dependency::Sfdisk + .cmd() + .arg("--no-reread") + .arg("--no-tell-kernel") + .arg("--part-uuid") + .arg(disk) + .arg(partition_number.to_string()) + .arg(uuid) + .run_and_check() + .context(format!( + "Failed to set partition UUID on {} partition {partition_number} to {uuid}", + disk.display() + ))?; + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/trident/src/init/offline/mod.rs b/crates/trident/src/init/offline/mod.rs index cdbeee23d6..ef3d038721 100644 --- a/crates/trident/src/init/offline/mod.rs +++ b/crates/trident/src/init/offline/mod.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Error}; use log::{debug, info, trace, warn}; use maplit::hashmap; -use osutils::lsblk; +use osutils::{lsblk, sfdisk}; use sysdefs::partition_types::DiscoverablePartitionType; use trident_api::{ config::{ @@ -256,22 +256,131 @@ fn generate_host_status( 
.structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) .message("Failed to find root device in lsblk output")?; - let disk_uuid = lsblk_device + let disk_uuid = match lsblk_device .ptuuid .clone() .and_then(|ptuuid| ptuuid.as_uuid()) - .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) - .message("No UUID found for root device")?; + { + Some(uuid) => uuid, + None => { + // lsblk didn't surface a PTUUID. This can happen in chroot + // environments (e.g. image-customizer / MIC) where the + // exposed loop device has partition children but the GPT + // disk-id either isn't set on the partition table or isn't + // populated by lsblk's PTUUID column. Fall back to sfdisk + // (which reads the GPT directly), and if that also reports + // no disk-id, mint one and persist it so the resulting + // image carries it forward to runtime. + let disk_dev_path = PathBuf::from("/dev").join(&lsblk_device.name); + warn!( + "PTUUID not reported by lsblk for {}; falling back to sfdisk", + disk_dev_path.display() + ); + let from_sfdisk = sfdisk::get_disk_uuid(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message("Failed to read GPT disk-id via sfdisk")? 
+ .and_then(|u| u.as_uuid()); + match from_sfdisk { + Some(uuid) => uuid, + None => { + let new_uuid = uuid::Uuid::new_v4(); + warn!( + "No GPT disk-id present on {}; assigning {}", + disk_dev_path.display(), + new_uuid + ); + sfdisk::set_disk_uuid(&disk_dev_path, &new_uuid.to_string()) + .structured( + ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, + ) + .message(format!( + "Failed to assign GPT disk-id on {}", + disk_dev_path.display() + ))?; + new_uuid + } + } + } + }; lsblk_device.children.sort_by_key(|p| p.partn); - for (i, part) in lsblk_device.children.iter().enumerate() { - if part.part_uuid.is_none() { - return Err(TridentError::new( - ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, - )) - .message(format!("No part UUID found for partition {}", i + 1)); + // Compute disk_dev_path once for partition-UUID fallback below. + let disk_dev_path = PathBuf::from("/dev").join(&lsblk_device.name); + + // For each partition, ensure we have a usable PARTUUID. Mirror the + // disk-id fallback above: prefer lsblk, then sfdisk, then mint a + // fresh one and persist it via sfdisk. Some chroot environments + // don't surface PARTUUID via lsblk --output-all and may also leave + // the value unset on the underlying GPT. + for (i, part) in lsblk_device.children.iter_mut().enumerate() { + if part.part_uuid.as_ref().and_then(|u| u.as_uuid()).is_some() { + continue; } + let partn = part.partn.unwrap_or((i + 1) as u32) as usize; + warn!( + "PARTUUID not reported by lsblk for partition {} on {}; falling back to sfdisk", + partn, + disk_dev_path.display() + ); + // Re-read the disk via sfdisk -J to find any UUID already present + // on this partition (sfdisk reads the GPT directly). 
+ let sf_info = sfdisk::SfDisk::get_info(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to read GPT info via sfdisk for {}", + disk_dev_path.display() + ))?; + if let Some(existing) = sf_info + .partitions + .iter() + .find(|p| p.number == partn) + .and_then(|p| p.id.as_uuid()) + { + // `existing` was parsed by as_uuid() and is re-rendered by + // to_string() below, so the stored text is that rendering and + // not necessarily the exact spelling (case) sfdisk printed. + part.part_uuid = Some(existing.to_string().into()); + continue; + } + + let new_uuid = uuid::Uuid::new_v4(); + warn!( + "Partition {} on {} has no PARTUUID; assigning {}", + partn, + disk_dev_path.display(), + new_uuid + ); + sfdisk::set_part_uuid(&disk_dev_path, partn, &new_uuid.to_string()) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to assign PARTUUID on partition {} of {}", + partn, + disk_dev_path.display() + ))?; + + // Re-read via sfdisk to confirm the PARTUUID was actually written. + // NOTE(review): the value goes through as_uuid()/to_string(), so + // sfdisk's on-disk spelling (case) is not preserved here; confirm + // whether downstream /dev/disk/by-partuuid/ lookups depend on case. + let written_uuid = sfdisk::SfDisk::get_info(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to re-read GPT info via sfdisk for {} after writing partition UUID", + disk_dev_path.display() + ))? + .partitions + .iter() + .find(|p| p.number == partn) + .and_then(|p| p.id.as_uuid()) + .ok_or_else(|| { + TridentError::new(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + }) + .message(format!( + "sfdisk reported no PARTUUID for partition {} after writing {}", + partn, new_uuid + ))?; + part.part_uuid = Some(written_uuid.to_string().into()); } // Get partition paths created from combining Prism history and lsblk output.
@@ -494,12 +603,21 @@ pub fn execute( trace!("Prism history contents:\n{history_file}"); + // Note: `disk` is the *runtime* device path that will be written + // into the datastore (e.g. /dev/sda). At build time inside Prism's + // chroot, this path generally does not exist because the disk is + // exposed as a loop device (the actual build-time device is + // auto-detected below by walking lsblk for the mount at "/"). + // Older code asserted that `disk` exist at build time, but that + // check tested the wrong invariant and broke AZL4 image builds + // where MIC does not bind a /dev/sda node into the chroot. let disk_path = Path::new(disk); if !disk_path.exists() { - return Err(TridentError::new( - ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, - )) - .message(format!("Prism chroot environment doesn't contain {disk}")); + debug!( + "Runtime disk path {} not present in build environment; \ + this is expected when running inside MIC's chroot.", + disk_path.display() + ); } let history: Vec = diff --git a/tests/e2e_tests/base_test.py b/tests/e2e_tests/base_test.py index 3e60948054..884e2dcec1 100644 --- a/tests/e2e_tests/base_test.py +++ b/tests/e2e_tests/base_test.py @@ -421,6 +421,9 @@ def test_users(connection, hostConfiguration): expected_users = list() expected_groups = dict() + if "os" not in hostConfiguration or "users" not in hostConfiguration.get("os", {}): + pytest.skip("No os.users in trident config (user baked into image by MIC)") + for user_info in hostConfiguration["os"]["users"]: expected_users.append(user_info["name"]) if "groups" in user_info: diff --git a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml new file mode 100644 index 0000000000..c601b34d9d --- /dev/null +++ b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml @@ -0,0 +1,195 @@ +# Base image config for trident-vm-grub-testimage-azl4. 
+# +# This builds the BOOTABLE base qcow2 that storm-trident rollback tests +# start the VM from. After this qcow2 boots, trident is installed and ready +# to drive A/B updates to the .cosi produced by updateimg-grub-azl4.yaml. +# +# Layout mirrors AZL3's baseimg-grub.yaml (A/B partitions) but uses +# AZL4-specific package names (dnf5, grub2-efi-x64, shim, etc.) matching +# the updateimg-grub-azl4.yaml flavor. +# +# TODO(azl4-rpm): Drop the trident + osmodifier additionalFiles entries +# once an AZL4 trident-service RPM and azurelinux-image-tools-osmodifier +# RPM are published. Until then we bake the binaries inline. + +storage: + disks: + - partitionTableType: gpt + maxSize: 10G + partitions: + - id: esp + type: esp + # 64M (vs AZL3's 16M) because AZL4 ships larger grub/shim + # binaries (~5MB grubx64.efi) and trident's offline-init + # copies them to both /boot/efi/EFI/AZLA and /AZLB. + size: 64M + + - id: root-a + size: 4G + + - id: root-b + size: 4G + + - id: trident + size: 1G + + - id: srv + size: grow + + bootType: efi + + filesystems: + - deviceId: esp + type: fat32 + mountPoint: + path: /boot/efi + options: umask=0077 + + - deviceId: root-a + type: ext4 + mountPoint: / + + - deviceId: trident + type: ext4 + mountPoint: /var/lib/trident + + - deviceId: srv + type: ext4 + mountPoint: /srv + +os: + bootloader: + resetType: hard-reset + hostname: trident-vm-testimg + + selinux: + mode: disabled + + kernelCommandLine: + # Mirrors AZL3 baseimg-grub.yaml; same console + debug settings so + # serial output works the same on both flavors. `net.ifnames=0` + # keeps interface naming as eth0/eth1/... so the + # `99-dhcp-eth0.network` systemd-networkd config matches the only + # virtio NIC the qemu test VM ships with. + extraCommandLine: + - console=tty0 + - console=tty1 + - console=ttyS0 + - net.ifnames=0 + - rd.debug + - loglevel=6 + - log_buf_len=1M + - systemd.journald.forward_to_console=1 + + packages: + install: + # AZL4 equivalents of the AZL3 set. 
See updateimg-grub-azl4.yaml + # for the rationale on each substitution. + - curl + - dnf5 + - efibootmgr + - grub2-efi-x64 + - grub2-efi-x64-modules + - grub2-tools + - grub2-tools-efi + - iproute + - iptables-nft + - jq + - lsof + - netplan + - openssh-server + - shim + - sudo + - systemd-networkd + - systemd-resolved + - vim + + services: + enable: + - sshd + - systemd-networkd + - systemd-resolved + # Trident socket-activated daemon. Storm-trident drives all + # update/commit/rollback through `trident grpc-client ...` which + # talks to this socket. + - tridentd.socket + # Oneshot trident commit at boot. Marks A/B update commits when + # they complete after reboot. + - trident.service + + additionalFiles: + # TODO(azl4-rpm): replace these binary copies and unit-file copies + # with `packages.install: - trident-service` once the RPM is + # published for AZL4. + - source: trident-bin/trident + destination: /usr/bin/trident + permissions: "755" + # TODO(azl4-osmodifier-rpm): replace with + # `packages.install: - azurelinux-image-tools-osmodifier` + # once the RPM is published. + - source: osmodifier-bin/osmodifier + destination: /usr/bin/osmodifier + permissions: "755" + + # Trident systemd units. AZL3 gets these from the trident-service + # RPM; AZL4 doesn't have that RPM yet so we ship them inline. The + # contents come straight from packaging/systemd/ in this repo so a + # source change requires a re-build of the qcow2 to pick up. + - source: ../../../../packaging/systemd/trident.service + destination: /usr/lib/systemd/system/trident.service + - source: ../../../../packaging/systemd/tridentd.service + destination: /usr/lib/systemd/system/tridentd.service + - source: ../../../../packaging/systemd/tridentd.socket + destination: /usr/lib/systemd/system/tridentd.socket + + # AZL4 lacks a /usr/bin/hostname binary; the pytest framework + # smoke-tests SSH with `hostname`, so we ship a tiny shim. 
+ - source: files/hostname-shim.sh + destination: /usr/local/bin/hostname + permissions: "755" + - source: files/sudoers-wheel + destination: /etc/sudoers.d/wheel + - source: files/99-dhcp-eth0.network + destination: /etc/systemd/network/99-dhcp-eth0.network + - source: files/regen-sshd-keys.service + destination: /etc/systemd/system/regen-sshd-keys.service + + users: + - name: testing-user + sshPublicKeyPaths: + - files/id_rsa.pub + secondaryGroups: + - wheel + +scripts: + postCustomization: + # Mirrors AZL3's baseimg-grub.yaml ordering: post-install runs + # first, then we bake the trident datastore at build time (so first + # boot is fast and storm-trident can immediately drive updates), + # then ssh + network housekeeping, then initrd rebuild + xattr + # strip last. + - path: scripts/post-install.sh + # Bake trident's hoststatus into the datastore at build time. AZL3 + # does this via update-host-status.sh; AZL4 uses the same pattern + # via update-host-status-azl4.sh. Requires trident's offline-init + # to tolerate the absence of /dev/sda inside MIC's chroot (the + # `disk` argument is a runtime label, not a build-time assertion); + # the fix lives in crates/trident/src/init/offline/mod.rs. + - path: scripts/update-host-status-azl4.sh + - path: scripts/enable-trident-service-azl4.sh + - path: scripts/ssh-move-host-keys-azl4.sh + - path: scripts/enable-regen-sshd-keys.sh + # Rebuild initramfs with --no-hostonly + extra SATA drivers so the + # qcow2 boots regardless of which bus the consumer's libvirt config + # picks (storm-trident uses bus=sata; the original boot test on + # karhu-ubuntu used bus=virtio). MUST run BEFORE strip-selinux-xattrs + # because dracut writes new files with the build-time SELinux + # context, and we want those stripped too. + - path: scripts/rebuild-initrd-azl4.sh + # Strip security.selinux xattrs from all files. 
See updateimg-grub- + # azl4.yaml for the parallel write-up; the same MOS-side AZL3 + # SELinux policy rejects AZL4 contexts when any future operation + # tries to preserve them. Keeping the qcow2 label-free is defensive. + # MUST run LAST so it sweeps any files produced by earlier scripts + # (initrd, etc.). + - path: scripts/strip-selinux-xattrs.sh diff --git a/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh new file mode 100644 index 0000000000..29889ea587 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Defensive enable of trident.service and tridentd.socket. +# +# AZL3 gets these via the trident-service RPM's %systemd_post scriptlet. +# AZL4 doesn't have that RPM yet, so we ship the units via additionalFiles +# and *should* be able to rely on baseimg-grub-azl4.yaml's `services.enable:` +# stanza. In practice, `services.enable` did not create the +# multi-user.target.wants/trident.service symlink in MIC AZL4 builds +# (build 1120959 showed multi-user.target reached but trident.service +# never started post-reboot, leaving servicingState stuck at +# ab-update-finalized). Until we figure out why, manually link the +# units defensively. +# +# tridentd.socket gets the same treatment because (a) if services.enable +# is unreliable for one unit, it's likely unreliable for the other, and +# (b) storm-trident drives every update/commit/rollback through the +# tridentd gRPC socket — a missing /run/trident/trident.sock at boot +# would fail every subsequent storm-trident invocation in the test +# pipeline. 
+set -euxo pipefail + +mkdir -p /etc/systemd/system/multi-user.target.wants +mkdir -p /etc/systemd/system/sockets.target.wants +ln -sf /usr/lib/systemd/system/trident.service \ + /etc/systemd/system/multi-user.target.wants/trident.service +ln -sf /usr/lib/systemd/system/tridentd.socket \ + /etc/systemd/system/sockets.target.wants/tridentd.socket + +# Belt and braces: log the enabled state for diagnostics. systemctl is-enabled +# may fail inside MIC's chroot without a running dbus, so don't gate the +# script on it. +systemctl is-enabled trident.service 2>&1 || true +systemctl is-enabled tridentd.socket 2>&1 || true +ls -l /etc/systemd/system/multi-user.target.wants/trident.service || true +ls -l /etc/systemd/system/sockets.target.wants/tridentd.socket || true diff --git a/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh new file mode 100644 index 0000000000..b07b3a8c0a --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Regenerate initrd with --no-hostonly so all storage drivers are +# included, not just the ones MIC's build environment happens to need. +# +# Why: storm-trident's rollback test (tools/storm/utils/vm/qemu/qemu.go) +# attaches the qcow2 to a virt-install VM with `bus=sata`. MIC builds +# the qcow2 in a virtio-backed environment, so dracut's default +# hostonly mode produces an initramfs with only virtio drivers. On a +# SATA-backed boot, the initramfs can't find the root partition by +# UUID and systemd hangs forever waiting for /dev/disk/by-uuid/. +# +# Rebuilding with --no-hostonly bakes in ahci, ata_piix, sata_sil, etc. +# along with virtio so the same qcow2 boots regardless of the bus type +# the consumer chooses. 
+# +# Runs inside the MIC chroot where /sys and /proc are bind-mounted but +# the host's SELinux is not loaded (MIC strips that), so dracut's +# cp -a doesn't hit the security.selinux setxattr issue that bites in +# AZL3 MOS during install (see strip-selinux-xattrs.sh for the parallel +# write-up). + +set -euo pipefail + +# Find the kernel version installed in this image. We require exactly +# one — `ls | head -1` would silently pick the wrong one if any future +# AZL4 variant ships multiple (kernel + kernel-hyperv, extramodules-*, +# etc.). Fail loudly rather than generate an initramfs for the wrong +# kernel: the failure mode of that misstep is "boot hangs waiting for +# /dev/disk/by-uuid/", which is the exact bug this script is +# meant to prevent. +KVERS=( /usr/lib/modules/* ) +case ${#KVERS[@]} in + 0) + echo "ERROR: no kernel modules dir under /usr/lib/modules" >&2 + exit 1 + ;; + 1) + KVER=$(basename "${KVERS[0]}") + ;; + *) + echo "ERROR: expected exactly one kernel under /usr/lib/modules, found:" >&2 + printf ' %s\n' "${KVERS[@]}" >&2 + exit 1 + ;; +esac +echo "Regenerating initramfs for kernel $KVER with --no-hostonly" + +# `--no-hostonly` includes all storage modules; `--no-hostonly-cmdline` +# prevents dracut from baking the build-host's /proc/cmdline parameters +# into the initramfs (which would fight the qcow2's grub cmdline at +# runtime); `--reproducible` keeps the output bit-stable across builds +# so we can detect spurious regenerations. 
+dracut \ + --no-hostonly \ + --no-hostonly-cmdline \ + --reproducible \ + --add-drivers "ahci ata_piix sata_sil sata_nv sata_via sd_mod" \ + --force \ + --kver "$KVER" + +echo "Regenerated initramfs:" +ls -lh /boot/initramfs-* diff --git a/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh new file mode 100644 index 0000000000..a2cfbe27f5 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# AZL4 equivalent of AZL3's update-host-status.sh. +# +# Runs inside MIC's chroot at qcow2 build time. Populates the trident +# datastore with the host status derived from Prism's history.json so +# the system boots ready for storm-trident to drive A/B updates -- no +# first-boot bootstrap, no datastore creation at runtime. +# +# Mirrors AZL3's pattern (scripts/update-host-status.sh, called from +# baseimg-grub.yaml). The trident binary in the chroot must understand +# that `--disk /dev/sda` is the runtime label and not a build-time +# existence assertion; see trident PR fixing the spurious check in +# crates/trident/src/init/offline/mod.rs. +set -euxo pipefail + +trident offline-initialize From d026fd45c04968270f6417cb33793477c0303bfb Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:21:03 -0700 Subject: [PATCH 38/42] fix: Remove stale osmodifier additionalFile from baseimg osmodifier is now a Rust crate built into the trident binary (PR #638). No separate osmodifier binary needs to be baked into test images. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident-vm-testimage/base/baseimg-grub-azl4.yaml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml index c601b34d9d..6237475c29 100644 --- a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml @@ -8,9 +8,9 @@ # AZL4-specific package names (dnf5, grub2-efi-x64, shim, etc.) matching # the updateimg-grub-azl4.yaml flavor. # -# TODO(azl4-rpm): Drop the trident + osmodifier additionalFiles entries -# once an AZL4 trident-service RPM and azurelinux-image-tools-osmodifier -# RPM are published. Until then we bake the binaries inline. +# TODO(azl4-rpm): Drop the trident additionalFiles entries +# once an AZL4 trident-service RPM is published. Until then we bake +# the binary inline. storage: disks: @@ -124,12 +124,6 @@ os: - source: trident-bin/trident destination: /usr/bin/trident permissions: "755" - # TODO(azl4-osmodifier-rpm): replace with - # `packages.install: - azurelinux-image-tools-osmodifier` - # once the RPM is published. - - source: osmodifier-bin/osmodifier - destination: /usr/bin/osmodifier - permissions: "755" # Trident systemd units. AZL3 gets these from the trident-service # RPM; AZL4 doesn't have that RPM yet so we ship them inline. The # contents come straight from packaging/systemd/ in this repo so a # source change requires a re-build of the qcow2 to pick up. - source: ../../../../packaging/systemd/trident.service destination: /usr/lib/systemd/system/trident.service - source: ../../../../packaging/systemd/tridentd.service destination: /usr/lib/systemd/system/tridentd.service From a6c8df964654a92703284e4a4ff29ed5351bcc55 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 10 Jun 2026 22:28:07 -0700 Subject: [PATCH 39/42] tests: Reduce AZL4 baseimg ESP from 64M to 32M Closer to AZL3's 16M. Remove stale comment about needing 64M.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../images/trident-vm-testimage/base/baseimg-grub-azl4.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml index 6237475c29..40bdf9c3f6 100644 --- a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml @@ -19,10 +19,7 @@ storage: partitions: - id: esp type: esp - # 64M (vs AZL3's 16M) because AZL4 ships larger grub/shim - # binaries (~5MB grubx64.efi) and trident's offline-init - # copies them to both /boot/efi/EFI/AZLA and /AZLB. - size: 64M + size: 32M - id: root-a size: 4G From acc94e0093f320d3d7daac89f5b22053db6feb70 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:52 -0700 Subject: [PATCH 40/42] infra: Add AZL4 VM rollback test stage via storm-trident Adds AZL4 VM rollback test pipeline stage using storm-trident for automated rollback validation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../testing_rollback/vm-testing-azl4.yml | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 .pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml diff --git a/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml b/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml new file mode 100644 index 0000000000..b9aa7fb91d --- /dev/null +++ b/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml @@ -0,0 +1,222 @@ +# AZL4 VM offline-init rollback test stage. +# +# Complement to testing_vm/netlaunch-testing-azl4.yml (BM-simulated install). +# This stage exercises the VM offline-init path: pre-baked AZL4 qcow2 boots +# directly, then storm-trident drives A/B update from the AZL4 COSI and +# tests rollback. 
+# +# Inputs (both built by other AZL4 stages in this pipeline): +# - trident-vm-grub-testimage-azl4-base.qcow2 (base qcow2 with trident +# systemd units + datastore baked by build-time offline-init) +# - trident-vm-grub-testimage-azl4.cosi (update target, same cosi +# the BM-sim stage uses for fresh installs) +# +# Differences from testing_rollback/testing-template.yml: +# * No test matrix. One configuration: AZL4 rollback. +# * No extension testing (--skip-extension-testing). The AZL4 cosi +# doesn't ship the sysext machinery yet. +# * No netplan runtime testing (--skip-netplan-runtime-testing). +# base-azl4 trident-config omits the os: section because the AZL3 +# MOS install path doesn't have osmodifier available; the qcow2 +# base shouldn't need netplan runtime tweaks either. +# * No manual rollback testing (--skip-manual-rollbacks) for first +# iteration; add once basic A/B works. +# * No runtime updates (--skip-runtime-updates) for first iteration. +# +# When AZL4 grows a trident-service RPM, sysext / netplan / runtime +# variants will reuse the AZL3 testing-template.yml as a matrix entry.
+ +parameters: + - name: baseQcowArtifact + type: string + default: "trident-vm-grub-testimage-azl4-base" + + - name: cosiArtifact + type: string + default: "trident-vm-grub-testimage-azl4" + + - name: dependsOnStage + type: string + default: "" + + - name: verboseLogging + type: boolean + default: true + + - name: pool + type: string + default: "trident-ubuntu-1es-pool-eastus2" + +stages: + - stage: RollbackTesting_AZL4 + displayName: Rollback Testing - AZL4 (VM offline-init) + dependsOn: + - BuildingTools + - PrepareSSHKeys + - TridentTestImg_trident_vm_grub_testimage_azl4 + - TridentTestImg_trident_vm_grub_testimage_azl4_base + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + variables: + - group: servicing_testing_params + - name: SSH_PRIVATE_KEY_PATH + value: "$HOME/.ssh/id_rsa" + - name: SSH_PUBLIC_KEY_PATH + value: "$(SSH_PRIVATE_KEY_PATH).pub" + + jobs: + - job: RollbackTestingAzl4 + displayName: Rollback Testing AZL4 + timeoutInMinutes: 30 + pool: + type: linux + name: ${{ parameters.pool }} + hostArchitecture: amd64 + + variables: + ob_outputDirectory: /tmp/deployment_logs_azl4_rollback + ob_artifactBaseName: "rollback-testing-azl4" + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 base qcow2" + inputs: + buildType: current + artifactName: "${{ parameters.baseQcowArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/base-qcow" + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 update COSI" + inputs: + buildType: current + artifactName: "${{ parameters.cosiArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/update-cosi" + + - task: DownloadPipelineArtifact@2 + displayName: "Download SSH keys" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh" + + - task: DownloadPipelineArtifact@2 + displayName: "Download go-tools" + inputs: + buildType: 
current + artifactName: "go-tools" + patterns: | + netlisten + storm-trident + targetPath: "$(TRIDENT_SOURCE_DIR)/bin" + + - bash: | + set -euxo pipefail + chmod +x $(TRIDENT_SOURCE_DIR)/bin/netlisten + chmod +x $(TRIDENT_SOURCE_DIR)/bin/storm-trident + cp $(Build.ArtifactStagingDirectory)/ssh/id_rsa* ~/.ssh/ + # Targeted permissions on the keys we just staged. Avoid + # `chmod -R 700 ~/.ssh/` because self-hosted agents may + # reuse the directory across jobs and we shouldn't trample + # other tooling's known_hosts / config / id_*. + chmod 700 ~/.ssh/ || true + chmod 600 ~/.ssh/id_rsa + chmod 644 ~/.ssh/id_rsa.pub + mkdir -p $(ob_outputDirectory) + + # Both the qcow2 base build and the COSI build stage the + # shared 'ssh-keys' artifact into their MIC trees (see + # .pipelines/templates/stages/build_image/build-image-template-azl4.yml). + # So the pipeline-wide PrepareSSHKeys id_rsa we just + # copied to ~/.ssh/ matches both A-side and B-side of + # the test VM. No per-build key swap needed. + ls -l ~/.ssh/ + + # storm-trident expects the artifacts laid out under + # one directory. testimages.py output uses a clone-index + # suffix; rename to the conventional names storm-trident + # script prepare-images would produce. + ARTIFACTS=$(Build.ArtifactStagingDirectory)/storm-input + mkdir -p "$ARTIFACTS" + + # storm-trident's qemu deploy looks for a qcow2 matching the + # regex `^trident-vm-.*-testimage.qcow2$` (see + # tools/storm/utils/vm/qemu/qemu.go:34). Our build artifact + # is named trident-vm-grub-testimage-azl4-base.qcow2 which + # doesn't match (-base.qcow2 not -testimage.qcow2 at end). + # Stage it under a name that matches. + QCOW_SRC="" + for c in \ + "$(Build.ArtifactStagingDirectory)/base-qcow/trident-vm-grub-testimage-azl4-base_0.qcow2" \ + "$(Build.ArtifactStagingDirectory)/base-qcow/trident-vm-grub-testimage-azl4-base.qcow2"; do + if [ -f "$c" ]; then QCOW_SRC="$c"; break; fi + done + if [ -z "$QCOW_SRC" ]; then + echo "Could not find AZL4 base qcow2. 
Contents:" + find "$(Build.ArtifactStagingDirectory)/base-qcow" -type f + exit 1 + fi + cp "$QCOW_SRC" "$ARTIFACTS/trident-vm-azl4-base-testimage.qcow2" + + # storm-trident's rollback test looks for any *.cosi in the + # artifacts dir (see tools/storm/rollback/tests/rollback.go:29). + # No rename needed beyond the clone-index suffix. + COSI_SRC="" + for c in \ + "$(Build.ArtifactStagingDirectory)/update-cosi/trident-vm-grub-testimage-azl4_0.cosi" \ + "$(Build.ArtifactStagingDirectory)/update-cosi/trident-vm-grub-testimage-azl4.cosi"; do + if [ -f "$c" ]; then COSI_SRC="$c"; break; fi + done + if [ -z "$COSI_SRC" ]; then + echo "Could not find AZL4 update COSI. Contents:" + find "$(Build.ArtifactStagingDirectory)/update-cosi" -type f + exit 1 + fi + cp "$COSI_SRC" "$ARTIFACTS/trident-vm-azl4-update-testimage.cosi" + + ls -lh "$ARTIFACTS" + displayName: "Stage artifacts for storm-trident" + workingDirectory: $(TRIDENT_SOURCE_DIR) + + - bash: | + set -euxo pipefail + + STORM_DYNAMIC_FLAGS="" + if [ "${{ parameters.verboseLogging }}" == "True" ]; then + STORM_DYNAMIC_FLAGS="$STORM_DYNAMIC_FLAGS --verbose" + fi + + # First-iteration AZL4 skips: see file header for rationale. 
+ STORM_DYNAMIC_FLAGS="$STORM_DYNAMIC_FLAGS \ + --skip-extension-testing \ + --skip-netplan-runtime-testing \ + --skip-manual-rollbacks \ + --skip-runtime-updates" + + sudo ./bin/storm-trident run rollback -a $STORM_DYNAMIC_FLAGS \ + --artifacts-dir $(Build.ArtifactStagingDirectory)/storm-input \ + --output-path $(ob_outputDirectory) \ + --platform qemu \ + --user testing-user \ + --ssh-private-key-path $(SSH_PRIVATE_KEY_PATH) \ + --ssh-public-key-path $(SSH_PUBLIC_KEY_PATH) \ + --force-cleanup + displayName: "🚀 Storm-trident rollback test (AZL4)" + workingDirectory: $(TRIDENT_SOURCE_DIR) + timeoutInMinutes: 20 + + - bash: | + set -eux + sudo zstd -T0 $(Build.ArtifactStagingDirectory)/booted.qcow2 || true + sudo mv $(Build.ArtifactStagingDirectory)/booted.qcow2.zst $(ob_outputDirectory)/ || true + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: failed() + displayName: "Publish OS disk on failure" + timeoutInMinutes: 5 + + - template: ../testing_common/fix-output-directory-for-one-branch-step.yml + parameters: + outputDir: $(ob_outputDirectory) + condition: always() From c1bd0583d3b0ba112f4767c2efbfa258836a49f9 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:21:03 -0700 Subject: [PATCH 41/42] fix: Use is_some_and instead of map_or for clippy Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osmodifier/src/grub_cfg.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index 19a9f3bd22..46dd82c971 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -117,7 +117,7 @@ fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result Date: Mon, 8 Jun 2026 13:38:56 -0700 Subject: [PATCH 42/42] docs: Remove PR references and stale osmodifier comments from rollback config Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../rollback-azl4/trident-config.yaml | 16 
++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml index e833209068..67a2c574a6 100644 --- a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml @@ -62,16 +62,12 @@ health: - non-existent-service1.service - non-existent-service2.service timeoutSeconds: 15 -# AZL4 variant of the AZL3 `health-checks-install/` scenario. Adapted for the -# PR-4 hostname-only fast path: -# - Empty top-level `users`/`selinux`/`netplan` so install validation does -# not require the OS Modifier binary to be in the MOS install ISO (which -# does not currently include it; once the MOS rebuild lands, both this -# scenario and base-azl4 can grow os.users / os.selinux / os.netplan). -# - `os.additionalFiles` is the one os.* field used because health.checks -# references `path: /var/lib/trident/local-health-check-file.sh`, which -# needs to be on the target filesystem. additionalFiles is processed by -# Trident's storage / file-deploy paths, not by OS Modifier. +# AZL4 variant of the AZL3 `health-checks-install/` scenario. +# - No `users`/`selinux`/`netplan` — these are baked into the test image +# at MIC build time. +# - `os.additionalFiles` is used because health.checks references +# `path: /var/lib/trident/local-health-check-file.sh`, which needs to +# be on the target filesystem. # # Health-check failure expectations (asserted by tests/e2e_tests/rollback_test.py): # - State transitions to `not-provisioned` (clean-install has no slot to