diff --git a/Cargo.lock b/Cargo.lock index b1bef640f..591896e50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1297,12 +1297,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - [[package]] name = "http" version = "1.4.0" @@ -1551,7 +1545,6 @@ dependencies = [ "serde", "serde_json", "serial_test", - "sha256", "signal-hook-registry", "tempfile", "termcolor", @@ -3236,19 +3229,6 @@ dependencies = [ "digest", ] -[[package]] -name = "sha256" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f880fc8562bdeb709793f00eb42a2ad0e672c4f883bbe59122b926eca935c8f6" -dependencies = [ - "async-trait", - "bytes", - "hex", - "sha2", - "tokio", -] - [[package]] name = "sharded-slab" version = "0.1.7" diff --git a/docs/hyperlight-surrogate-development-notes.md b/docs/hyperlight-surrogate-development-notes.md index 827093cce..431cc85da 100644 --- a/docs/hyperlight-surrogate-development-notes.md +++ b/docs/hyperlight-surrogate-development-notes.md @@ -4,6 +4,8 @@ > Note: The use of surrogates is a temporary workaround on Windows until WHP allows us to create more than one partition per running process. -These surrogate processes are managed by the host via the [surrogate_process_manager](./src/hyperlight_host/src/hypervisor/surrogate_process_manager.rs) which will launch several of these surrogates (up to the 512), assign memory to them, then launch partitions from there, and reuse them as necessary. 
+These surrogate processes are managed by the host via the [surrogate_process_manager](./src/hyperlight_host/src/hypervisor/surrogate_process_manager.rs) which pre-creates an initial pool of surrogates at startup (512 by default, configurable via the `HYPERLIGHT_INITIAL_SURROGATES` environment variable). If the pool is exhausted, additional processes are created on demand up to a configurable maximum (`HYPERLIGHT_MAX_SURROGATES`, also defaulting to 512). Once the maximum is reached, callers block until a process is returned to the pool. -`hyperlight_surrogate.exe` gets built during `hyperlight-host`'s build script, gets embedded into the `hyperlight-host` Rust library via [rust-embed](https://crates.io/crates/rust-embed), and is extracted at runtime next to the executable when the surrogate process manager is initialized. \ No newline at end of file +> **Note:** `HYPERLIGHT_MAX_SURROGATES` is authoritative — if `HYPERLIGHT_INITIAL_SURROGATES` exceeds it, the initial count is silently clamped down to the maximum. For example, setting only `HYPERLIGHT_MAX_SURROGATES=256` limits both the initial pool and the ceiling to 256. + +`hyperlight_surrogate.exe` gets built during `hyperlight-host`'s build script, gets embedded into the `hyperlight-host` Rust library via [rust-embed](https://crates.io/crates/rust-embed), and is extracted at runtime next to the executable when the surrogate process manager is initialized. The extracted filename includes a short BLAKE3 hash of the binary content (e.g., `hyperlight_surrogate_a1b2c3d4.exe`) so that multiple hyperlight versions can coexist without file-deletion races. 
diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index c183037fc..a30764d0b 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -70,7 +70,6 @@ windows = { version = "0.62", features = [ windows-sys = { version = "0.61", features = ["Win32"] } windows-result = "0.4" rust-embed = { version = "8.11.0", features = ["debug-embed", "include-exclude", "interpolate-folder-path"] } -sha256 = "1.6.0" windows-version = "0.1" lazy_static = "1.4.0" diff --git a/src/hyperlight_host/src/hypervisor/surrogate_process_manager.rs b/src/hyperlight_host/src/hypervisor/surrogate_process_manager.rs index 08c60e370..de15fde0b 100644 --- a/src/hyperlight_host/src/hypervisor/surrogate_process_manager.rs +++ b/src/hyperlight_host/src/hypervisor/surrogate_process_manager.rs @@ -19,10 +19,11 @@ use std::fs::File; use std::io::Write; use std::mem::size_of; use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; -use crossbeam_channel::{Receiver, Sender, unbounded}; +use crossbeam_channel::{Receiver, Sender, TryRecvError, unbounded}; use rust_embed::RustEmbed; -use tracing::{Span, error, info, instrument}; +use tracing::{Span, error, info, instrument, warn}; use windows::Win32::Foundation::HANDLE; use windows::Win32::Security::SECURITY_ATTRIBUTES; use windows::Win32::System::JobObjects::{ @@ -49,14 +50,87 @@ use crate::{Result, log_then_return, new_error}; #[include = "hyperlight_surrogate.exe"] struct Asset; -/// This is the name of the surrogate process binary that will be used to create surrogate processes. -/// The process does nothing , it just sleeps forever. Its only purpose is to provide a host for memory that will be mapped -/// into the guest using the `WHvMapGpaRange2` API. -pub(crate) const SURROGATE_PROCESS_BINARY_NAME: &str = "hyperlight_surrogate.exe"; +/// The name of the embedded surrogate asset (used as the key for `Asset::get`). 
+const EMBEDDED_SURROGATE_NAME: &str = "hyperlight_surrogate.exe"; + +/// The absolute hard limit on surrogate processes imposed by the +/// `WHvMapGpaRange2` API (512 process handles per calling process). +const HARD_MAX_SURROGATE_PROCESSES: usize = 512; + +/// Environment variable controlling how many surrogate processes are +/// pre-created when the manager starts. Must be between 1 and +/// `HARD_MAX_SURROGATE_PROCESSES` (512). Defaults to 512 if unset. +const INITIAL_SURROGATES_ENV_VAR: &str = "HYPERLIGHT_INITIAL_SURROGATES"; + +/// Environment variable controlling the maximum number of surrogate processes +/// that can exist (including those created on demand). Must be >= +/// `HYPERLIGHT_INITIAL_SURROGATES` and <= `HARD_MAX_SURROGATE_PROCESSES` +/// (512). Defaults to 512 if unset. +const MAX_SURROGATES_ENV_VAR: &str = "HYPERLIGHT_MAX_SURROGATES"; + +/// Returns the on-disk filename for the surrogate binary, incorporating the +/// first 8 hex characters of the BLAKE3 hash of the embedded binary so that +/// different hyperlight versions produce different filenames and can coexist +/// without file-deletion races. +fn surrogate_binary_name() -> Result<String> { + let exe = Asset::get(EMBEDDED_SURROGATE_NAME) + .ok_or_else(|| new_error!("could not find embedded surrogate binary"))?; + let hash = blake3::hash(exe.data.as_ref()); + let short_hash = &hash.to_hex()[..8]; + Ok(format!("hyperlight_surrogate_{short_hash}.exe")) +} + +/// Pure validation/clamping logic for surrogate process counts. +/// +/// `raw_initial` and `raw_max` are the parsed values from the environment +/// (or `None` when the variable is unset or unparsable). +/// +/// Resolution order: +/// 1. `max` is clamped to `1..=HARD_MAX_SURROGATE_PROCESSES`, defaulting +/// to `HARD_MAX_SURROGATE_PROCESSES` when `None`. +/// 2. `initial` is clamped to `1..=max`, defaulting to `max` when `None`. +/// This guarantees `initial <= max` without an extra conditional. 
+fn compute_surrogate_counts(raw_initial: Option<usize>, raw_max: Option<usize>) -> (usize, usize) { + let max = raw_max + .map(|n| n.clamp(1, HARD_MAX_SURROGATE_PROCESSES)) + .unwrap_or(HARD_MAX_SURROGATE_PROCESSES); + + // Clamp initial to 1..=max so it can never exceed the authoritative limit. + let initial = raw_initial.map(|n| n.clamp(1, max)).unwrap_or(max); + + (initial, max) +} -/// The maximum number of surrogate processes that can be created. -/// (This is a factor of limitations in the `WHvMapGpaRange2` API which only allows 512 different process handles). -const NUMBER_OF_SURROGATE_PROCESSES: usize = 512; +/// Returns the (initial, max) surrogate process counts from environment +/// variables, applying validation and clamping. +/// +/// - `HYPERLIGHT_INITIAL_SURROGATES`: clamped to `1..=max`, default `max`. +/// - `HYPERLIGHT_MAX_SURROGATES`: clamped to `1..=512`, default 512. +fn surrogate_process_counts() -> (usize, usize) { + let raw_initial = std::env::var(INITIAL_SURROGATES_ENV_VAR) + .ok() + .and_then(|v| v.parse::<usize>().ok()); + let raw_max = std::env::var(MAX_SURROGATES_ENV_VAR) + .ok() + .and_then(|v| v.parse::<usize>().ok()); + + let (initial, max) = compute_surrogate_counts(raw_initial, raw_max); + + // Log clamping warnings here (not in the pure function) so the + // messages can reference the env var names that provide context. + if let Some(n) = raw_initial + && n != initial + { + warn!("{INITIAL_SURROGATES_ENV_VAR}={n} was clamped to {initial}"); + } + if let Some(n) = raw_max + && n != max + { + warn!("{MAX_SURROGATES_ENV_VAR}={n} was clamped to {max}"); + } + + (initial, max) +} /// `SurrogateProcessManager` manages hyperlight_surrogate processes. These /// processes are required to allow multiple WHP Partitions to be created in a @@ -86,11 +160,14 @@ const NUMBER_OF_SURROGATE_PROCESSES: usize = 512; /// on allocation/return to/from a Sandbox instance. /// It is intended to be used as a singleton and is thread safe. 
/// -/// There is a limit of 512 partitions per process therefore this class will -/// create a maximum of 512 processes, and if the pool is empty when a Sandbox -/// is created it will wait for a free process. +/// There is a limit of 512 partitions per process. By default 512 processes +/// are pre-created at startup, but this can be reduced via +/// `HYPERLIGHT_INITIAL_SURROGATES`. Additional processes are created on +/// demand up to the limit set by `HYPERLIGHT_MAX_SURROGATES` (also +/// defaulting to 512). If the pool is empty and the max has been reached, +/// callers will block until a process is returned. /// -/// This class is `Send + Sync`, and internally manages the pool of 512 +/// This class is `Send + Sync`, and internally manages the pool of /// surrogate processes in a concurrency-safe way. pub(crate) struct SurrogateProcessManager { job_handle: HandleWrapper, @@ -111,14 +188,23 @@ pub(crate) struct SurrogateProcessManager { /// - ... and the `Hypervisor` trait requires `Send + Sync` process_receiver: Receiver<HandleWrapper>, process_sender: Sender<HandleWrapper>, + /// Path to the on-disk surrogate binary (hash-stamped). + surrogate_process_path: PathBuf, + /// Maximum number of surrogate processes allowed to exist. + max_processes: usize, + /// Number of surrogate processes that have been created so far. + /// Used to decide whether we can spawn more on demand. 
+ created_count: AtomicUsize, } impl SurrogateProcessManager { #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] fn new() -> Result<Self> { - ensure_surrogate_process_exe()?; - let surrogate_process_path = - get_surrogate_process_dir()?.join(SURROGATE_PROCESS_BINARY_NAME); + let binary_name = surrogate_binary_name()?; + ensure_surrogate_process_exe(&binary_name)?; + let surrogate_process_path = get_surrogate_process_dir()?.join(&binary_name); + + let (initial, max) = surrogate_process_counts(); let (sender, receiver) = unbounded(); let job_handle = create_job_object()?; @@ -126,17 +212,71 @@ impl SurrogateProcessManager { job_handle, process_receiver: receiver, process_sender: sender, + surrogate_process_path, + max_processes: max, + created_count: AtomicUsize::new(0), }; - surrogate_process_manager - .create_surrogate_processes(&surrogate_process_path, job_handle)?; + surrogate_process_manager.create_initial_surrogate_processes(initial)?; + Ok(surrogate_process_manager) } - /// Gets a surrogate process from the pool of surrogate processes and - /// allocates memory in the process. This should be called when a new - /// HyperV on Windows Driver is created. + /// Gets a surrogate process from the pool. If the pool is empty and + /// fewer than `max_processes` have been created, a new process is + /// spawned on demand. If the pool is empty and the maximum has been + /// reached, this call blocks until a process is returned. #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] pub(super) fn get_surrogate_process(&self) -> Result<SurrogateProcess> { + // Fast path: try to grab an already-pooled process. + match self.process_receiver.try_recv() { + Ok(handle) => { + let surrogate_process_handle: HANDLE = handle.into(); + return Ok(SurrogateProcess::new(surrogate_process_handle)); + } + Err(TryRecvError::Empty) => { + // Pool is empty — try to grow on demand below. 
+ } + Err(TryRecvError::Disconnected) => { + return Err(new_error!("surrogate process channel disconnected")); + } + } + + // On-demand growth: atomically claim a slot if one is available. + // We use a CAS loop so that concurrent callers don't overshoot + // the maximum. + loop { + let current = self.created_count.load(Ordering::Acquire); + if current >= self.max_processes { + // At the limit — fall through to the blocking recv below. + break; + } + if self + .created_count + .compare_exchange(current, current + 1, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + info!( + "on-demand surrogate process creation ({}/{})", + current + 1, + self.max_processes + ); + let handle = + match create_surrogate_process(&self.surrogate_process_path, self.job_handle) { + Ok(h) => h, + Err(e) => { + // Rollback the slot claim so capacity isn't + // permanently lost on transient failures. + self.created_count.fetch_sub(1, Ordering::AcqRel); + return Err(e); + } + }; + let surrogate_process_handle: HANDLE = handle.into(); + return Ok(SurrogateProcess::new(surrogate_process_handle)); + } + // CAS failed — another thread beat us; retry. + } + + // Maximum reached — block until a process is returned to the pool. let surrogate_process_handle: HANDLE = self.process_receiver.recv()?.into(); Ok(SurrogateProcess::new(surrogate_process_handle)) } @@ -146,19 +286,25 @@ impl SurrogateProcessManager { /// implementation, after process resources have been freed. #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] pub(super) fn return_surrogate_process(&self, proc_handle: HandleWrapper) -> Result<()> { - Ok(self.process_sender.clone().send(proc_handle)?) + Ok(self.process_sender.send(proc_handle)?) } - /// Creates all the surrogate process when the struct is first created. + /// Pre-creates the initial batch of surrogate processes at startup. 
#[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] - fn create_surrogate_processes( - &self, - surrogate_process_path: &Path, - job_handle: HandleWrapper, - ) -> Result<()> { - for _ in 0..NUMBER_OF_SURROGATE_PROCESSES { - let surrogate_process = create_surrogate_process(surrogate_process_path, job_handle)?; - self.process_sender.clone().send(surrogate_process)?; + fn create_initial_surrogate_processes(&self, initial_count: usize) -> Result<()> { + info!( + "pre-creating {} surrogate processes ({}={:?}, {}={:?})", + initial_count, + INITIAL_SURROGATES_ENV_VAR, + std::env::var(INITIAL_SURROGATES_ENV_VAR).ok(), + MAX_SURROGATES_ENV_VAR, + std::env::var(MAX_SURROGATES_ENV_VAR).ok(), + ); + for _ in 0..initial_count { + let surrogate_process = + create_surrogate_process(&self.surrogate_process_path, self.job_handle)?; + self.process_sender.send(surrogate_process)?; + self.created_count.fetch_add(1, Ordering::AcqRel); } Ok(()) @@ -247,40 +393,53 @@ fn get_surrogate_process_dir() -> Result { Ok(path.to_path_buf()) } +/// Ensures the surrogate binary exists on disk at the hash-stamped path. +/// +/// Because the filename embeds the content hash, two different hyperlight +/// versions will write to *different* files. This avoids the +/// delete-while-running race that occurred when an older version's running +/// processes held a lock on the same filename. +/// +/// Uses `File::create_new` for atomic create-or-fail semantics, avoiding +/// a TOCTOU race with `exists()` + `create()`. If the write fails after +/// creating the file, the partial file is deleted so future runs do not +/// mistake it for a valid binary. 
#[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] -fn ensure_surrogate_process_exe() -> Result<()> { - let surrogate_process_path = get_surrogate_process_dir()?.join(SURROGATE_PROCESS_BINARY_NAME); - let p = Path::new(&surrogate_process_path); +fn ensure_surrogate_process_exe(binary_name: &str) -> Result<()> { + let dir = get_surrogate_process_dir()?; + let surrogate_process_path = dir.join(binary_name); - let exe = Asset::get(SURROGATE_PROCESS_BINARY_NAME) + // Resolve the embedded asset before touching the filesystem so a + // missing asset can't leave a zero-byte ghost file on disk. + let exe = Asset::get(EMBEDDED_SURROGATE_NAME) .ok_or_else(|| new_error!("could not find embedded surrogate binary"))?; - if p.exists() { - // check to see if sha's match and if not delete the file so we'll extract - // the embedded file below. - let embedded_file_sha = sha256::digest(exe.data.as_ref()); - let file_on_disk_sha = sha256::try_digest(&p)?; - - if embedded_file_sha != file_on_disk_sha { - println!( - "sha of embedded surrogate '{}' does not match sha of file on disk '{}' - deleting surrogate binary at {}", - embedded_file_sha, - file_on_disk_sha, + // Atomic create-or-fail: if the file already exists, `create_new` + // returns `AlreadyExists` and we skip extraction. The filename + // embeds the content hash, so an existing file is guaranteed to + // have the correct content. 
+ match File::create_new(&surrogate_process_path) { + Ok(mut f) => { + info!( + "{} does not exist, extracting to {}", + binary_name, &surrogate_process_path.display() ); - std::fs::remove_file(p)?; - } - } - - if !p.exists() { - info!( - "{} does not exist, copying to {}", - SURROGATE_PROCESS_BINARY_NAME, - &surrogate_process_path.display() - ); - let mut f = File::create(&surrogate_process_path)?; - f.write_all(exe.data.as_ref())?; + if let Err(e) = f.write_all(exe.data.as_ref()) { + // Clean up the partial file so future runs don't skip + // extraction thinking a valid binary is already present. + // Drop the file handle first — on Windows, open handles + // prevent deletion. + drop(f); + let _ = std::fs::remove_file(&surrogate_process_path); + return Err(e.into()); + } + } + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // Another thread/process already extracted the file — nothing to do. + } + Err(e) => return Err(e.into()), } Ok(()) @@ -358,7 +517,9 @@ mod tests { // create more threads than surrogate processes as we want to test that // the manager can handle multiple threads requesting processes at the // same time when there are not enough processes available. 
- for t in 0..NUMBER_OF_SURROGATE_PROCESSES * 2 { + let surrogate_process_manager = get_surrogate_process_manager().unwrap(); + let max_processes = surrogate_process_manager.max_processes; + for t in 0..max_processes * 2 { let thread_handle = thread::spawn(move || -> Result<()> { let surrogate_process_manager_res = get_surrogate_process_manager(); let mut rng = rng(); @@ -368,7 +529,7 @@ mod tests { // for each of the parent loop iterations, try to get a // surrogate process, make sure we actually got one, // then put it back - for p in 0..NUMBER_OF_SURROGATE_PROCESSES { + for p in 0..max_processes { let timer = Instant::now(); let surrogate_process = { let res = surrogate_process_manager.get_surrogate_process()?; @@ -406,12 +567,13 @@ mod tests { assert!(thread_handle.join().is_ok()); } - assert_number_of_surrogate_processes(NUMBER_OF_SURROGATE_PROCESSES); + assert_number_of_surrogate_processes(max_processes); } #[track_caller] fn assert_number_of_surrogate_processes(expected_count: usize) { - let sleep_count = 10; + const MAX_RETRIES: u32 = 30; + let mut attempt = 0; loop { let snapshot_handle = unsafe { CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0) }; assert!(snapshot_handle.is_ok()); @@ -425,7 +587,8 @@ mod tests { while result { if let Ok(process_name) = unsafe { CStr::from_ptr(process_entry.szExeFile.as_ptr()).to_str() } - && process_name == SURROGATE_PROCESS_BINARY_NAME + && process_name.starts_with("hyperlight_surrogate_") + && process_name.ends_with(".exe") { count += 1; } @@ -435,9 +598,10 @@ mod tests { } } - // if the expected count is 0, we are waiting for the processes to exit, this doesnt happen immediately, so we wait for a bit - - if (expected_count == 0) && (count > 0) && (sleep_count < 30) { + // When waiting for processes to exit (expected_count == 0), + // retry with a delay since termination isn't instantaneous. 
+ attempt += 1; + if expected_count == 0 && count > 0 && attempt < MAX_RETRIES { thread::sleep(Duration::from_secs(1)); } else { assert_eq!(count, expected_count); @@ -622,4 +786,174 @@ mod tests { "Mapping should be removed after ref count reaches 0" ); } + + /// Tests `ensure_surrogate_process_exe` for: + /// 1. Correct extraction — file content matches the embedded binary. + /// 2. Re-extraction after deletion — a missing file is recreated. + /// 3. Idempotent when already present — second call succeeds without + /// error (exercises the `AlreadyExists` fast path). + /// 4. Deterministic naming — same embedded binary always produces + /// the same hash-stamped filename. + /// + /// Uses a test-specific filename to avoid conflicting with the + /// singleton's surrogate processes, which hold a PE loader lock + /// on the real hash-stamped exe. + #[test] + fn test_ensure_surrogate_exe() { + let test_binary_name = "hyperlight_surrogate_test_extraction.exe"; + let dir = get_surrogate_process_dir().expect("should get surrogate dir"); + let path = dir.join(test_binary_name); + + // Ensure a clean slate. 
+ let _ = std::fs::remove_file(&path); + + // --- First call: extracts the binary --- + ensure_surrogate_process_exe(test_binary_name).expect("first call should succeed"); + assert!(path.exists(), "binary should exist after extraction"); + + // --- Verify extracted content matches embedded binary --- + let on_disk = std::fs::read(&path).expect("should read extracted file"); + let embedded = Asset::get(EMBEDDED_SURROGATE_NAME).expect("embedded asset should exist"); + assert_eq!( + on_disk, + embedded.data.as_ref(), + "extracted file content should match embedded binary" + ); + + // --- Second call: file exists, should skip (AlreadyExists path) --- + ensure_surrogate_process_exe(test_binary_name) + .expect("second call should succeed when file already exists"); + + // --- Delete and re-extract --- + std::fs::remove_file(&path).expect("should be able to delete test binary"); + assert!(!path.exists(), "binary should be gone after deletion"); + + ensure_surrogate_process_exe(test_binary_name) + .expect("should succeed re-extracting after deletion"); + assert!(path.exists(), "binary should be re-created after deletion"); + + // Clean up test artifact. + let _ = std::fs::remove_file(&path); + + // --- Verify deterministic naming --- + let binary_name = surrogate_binary_name().expect("should succeed"); + let binary_name_2 = surrogate_binary_name().expect("second call should also succeed"); + assert_eq!( + binary_name, binary_name_2, + "surrogate_binary_name should be deterministic" + ); + } + + /// Verifies `compute_surrogate_counts()` returns sensible defaults + /// when inputs are `None`, and correct clamped values otherwise. + /// + /// This exercises the pure validation/clamping function directly, + /// avoiding process-global env var mutation which previously caused + /// a race with the `lazy_static` singleton initialisation in + /// parallel test runs. 
+ #[test] + fn test_compute_surrogate_counts() { + // --- Both unset (or unparsable) → defaults --- + let (initial, max) = compute_surrogate_counts(None, None); + assert_eq!( + initial, HARD_MAX_SURROGATE_PROCESSES, + "default initial should be {HARD_MAX_SURROGATE_PROCESSES}" + ); + assert_eq!( + max, HARD_MAX_SURROGATE_PROCESSES, + "default max should be {HARD_MAX_SURROGATE_PROCESSES}" + ); + + // --- Only initial set --- + let (initial, max) = compute_surrogate_counts(Some(32), None); + assert_eq!(initial, 32, "initial should honour provided value"); + assert_eq!( + max, HARD_MAX_SURROGATE_PROCESSES, + "max should default when unset" + ); + + // --- Both set, max > initial --- + let (initial, max) = compute_surrogate_counts(Some(8), Some(64)); + assert_eq!(initial, 8); + assert_eq!(max, 64); + + // --- Both set, max < initial → initial clamped DOWN to max --- + let (initial, max) = compute_surrogate_counts(Some(100), Some(10)); + assert_eq!(max, 10, "max is authoritative and should not be inflated"); + assert_eq!( + initial, 10, + "initial should be clamped down to max when it exceeds it" + ); + + // --- initial below minimum → clamped to 1 --- + let (initial, max) = compute_surrogate_counts(Some(0), None); + assert_eq!(initial, 1, "initial should be clamped to minimum of 1"); + assert_eq!( + max, HARD_MAX_SURROGATE_PROCESSES, + "max should default when unset" + ); + + // --- initial above hard limit → clamped to 512 --- + let (initial, max) = compute_surrogate_counts(Some(9999), None); + assert_eq!( + initial, HARD_MAX_SURROGATE_PROCESSES, + "initial should be clamped to {HARD_MAX_SURROGATE_PROCESSES}" + ); + assert_eq!(max, HARD_MAX_SURROGATE_PROCESSES); + + // --- Only max set → initial defaults then clamped down to max --- + let (initial, max) = compute_surrogate_counts(None, Some(256)); + assert_eq!(max, 256, "max should honour provided value"); + assert_eq!( + initial, 256, + "initial should be clamped down to max when it defaults above it" + ); + + // --- 
max below minimum → clamped to 1, initial follows --- + let (initial, max) = compute_surrogate_counts(None, Some(0)); + assert_eq!(max, 1, "max should be clamped to minimum of 1"); + assert_eq!(initial, 1, "initial should be clamped down to max"); + + // --- max above hard limit → clamped to 512 --- + let (initial, max) = compute_surrogate_counts(None, Some(9999)); + assert_eq!( + max, HARD_MAX_SURROGATE_PROCESSES, + "max should be clamped to {HARD_MAX_SURROGATE_PROCESSES}" + ); + assert_eq!(initial, HARD_MAX_SURROGATE_PROCESSES); + + // --- Both at boundary values --- + let (initial, max) = compute_surrogate_counts(Some(1), Some(1)); + assert_eq!(initial, 1); + assert_eq!(max, 1); + + let (initial, max) = compute_surrogate_counts( + Some(HARD_MAX_SURROGATE_PROCESSES), + Some(HARD_MAX_SURROGATE_PROCESSES), + ); + assert_eq!(initial, HARD_MAX_SURROGATE_PROCESSES); + assert_eq!(max, HARD_MAX_SURROGATE_PROCESSES); + } + + /// Smoke-tests `surrogate_process_counts()` with the default test + /// environment (neither env var set). This does NOT mutate env vars — + /// it just verifies the env-reading wrapper returns the expected + /// defaults under normal conditions. + #[test] + fn test_surrogate_process_counts_defaults() { + // In the standard CI / dev environment, neither HYPERLIGHT_INITIAL_SURROGATES + // nor HYPERLIGHT_MAX_SURROGATES is set, so we expect the hard-max defaults. + // If the env vars ARE set externally (e.g. a developer's shell), this test + // gracefully adapts: it only asserts the invariant initial <= max <= 512. + let (initial, max) = surrogate_process_counts(); + assert!( + (1..=HARD_MAX_SURROGATE_PROCESSES).contains(&initial), + "initial {initial} should be in 1..={HARD_MAX_SURROGATE_PROCESSES}" + ); + assert!( + (1..=HARD_MAX_SURROGATE_PROCESSES).contains(&max), + "max {max} should be in 1..={HARD_MAX_SURROGATE_PROCESSES}" + ); + assert!(initial <= max, "initial ({initial}) must be <= max ({max})"); + } }