From e757dd7a390bd2d85fdb1da9c7ad97203ae295d7 Mon Sep 17 00:00:00 2001 From: James Munns Date: Fri, 8 May 2026 14:27:43 +0200 Subject: [PATCH 01/12] Add ereports for host panic/boot failure. Closes #2140 Related to #2337 --- Cargo.lock | 2 + app/cosmo/base.toml | 2 +- task/host-sp-comms/Cargo.toml | 4 ++ task/host-sp-comms/src/main.rs | 117 +++++++++++++++++++++++++++++---- 4 files changed, 113 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5f340d18..7e1a79b16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5893,12 +5893,14 @@ dependencies = [ "drv-stm32h7-usart", "drv-stm32xx-sys-api", "enum-map", + "ereports", "heapless", "host-sp-messages", "hubpack", "idol", "idol-runtime", "ksz8463", + "microcbor", "multitimer", "num-traits", "oxide-barcode", diff --git a/app/cosmo/base.toml b/app/cosmo/base.toml index 4fbb3beaf..acc793a9a 100644 --- a/app/cosmo/base.toml +++ b/app/cosmo/base.toml @@ -258,7 +258,7 @@ features = ["stm32h753", "usart6", "baud_rate_3M", "hardware_flow_control", "vla uses = ["usart6", "dbgmcu"] interrupts = {"usart6.irq" = "usart-irq"} priority = 9 -max-sizes = {flash = 70000, ram = 65536} +max-sizes = {flash = 72000, ram = 65536} stacksize = 5400 start = true task-slots = ["sys", { cpu_seq = "cosmo_seq" }, "hf", "control_plane_agent", "net", "packrat", "i2c_driver", { spi_driver = "spi2_driver" }, "sprot", "auxflash"] diff --git a/task/host-sp-comms/Cargo.toml b/task/host-sp-comms/Cargo.toml index c5cc83dbf..fd5fdb76a 100644 --- a/task/host-sp-comms/Cargo.toml +++ b/task/host-sp-comms/Cargo.toml @@ -48,6 +48,10 @@ task-sensor-api = { path = "../../task/sensor-api", optional = true, features = ksz8463 = { path = "../../drv/ksz8463", optional = true } drv-sprot-api = { path = "../../drv/sprot-api"} +# ereports deps +ereports = { path = "../../lib/ereports", features = ["ereporter-macro"] } +microcbor = { path = "../../lib/microcbor" } + [build-dependencies] build-util.path = "../../build/util" build-i2c = { path = "../../build/i2c", optional = true } diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 61a0b28df..5ff34cc1b 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -28,6 +28,7 @@ use host_sp_messages::{ }; use hubpack::SerializedSize; use idol_runtime::{NotificationHandler, RequestError}; +use microcbor::Encode; use multitimer::{Multitimer, Repeat}; use ringbuf::{counted_ringbuf, ringbuf_entry}; use static_assertions::const_assert; @@ -275,6 +276,14 @@ impl HostKeyValueStorage { } } +/// Metadata about panics observed from the host +struct HostPanicMetadata { + /// Length in bytes of the currently stored panic message + total_length: usize, + /// (hopefully not) Rolling counter of panic messages observed this power cycle + total_count: u32, +} + struct ServerImpl { uart: Usart, sys: sys_api::Sys, @@ -292,6 +301,10 @@ struct ServerImpl { host_kv_storage: HostKeyValueStorage, hf_mux_state: Option, + ereporter: Ereporter, + host_panic_state: Option, + host_boot_fail_state: Option, + /// Temporary space for inventory data, which is a large `enum` scratch: &'static mut host_sp_messages::InventoryData, @@ -380,6 +393,9 @@ impl ServerImpl { }); BUFS.claim() }; + + let packrat = Packrat::from(PACKRAT.get_task_id()); + Self { uart, sys, @@ -399,7 +415,7 @@ impl ServerImpl { cp_agent: ControlPlaneAgent::from( CONTROL_PLANE_AGENT.get_task_id(), ), - packrat: Packrat::from(PACKRAT.get_task_id()), + packrat: packrat.clone(), sprot: SpRot::from(SPROT.get_task_id()), reboot_state: None, host_kv_storage: HostKeyValueStorage { @@ -414,6 +430,9 @@ impl ServerImpl { hf_mux_state: None, last_power_off: None, scratch, + host_panic_state: None, + host_boot_fail_state: None, + ereporter: Ereporter::claim_static_resources(packrat), } } @@ -986,12 +1005,32 @@ impl ServerImpl { data.len(), self.host_kv_storage.last_boot_fail.len(), ); - self.host_kv_storage.last_boot_fail[..n] - .copy_from_slice(&data[..n]); - for b in &mut self.host_kv_storage.last_boot_fail[n..] { - *b = 0; - } + let (now, later) = + self.host_kv_storage.last_boot_fail.split_at_mut(n); + now.copy_from_slice(&data[..n]); + later.iter_mut().for_each(|b| *b = 0); + self.host_kv_storage.last_boot_fail_reason = reason; + + // Take the old count, if any, and add one to it. If that count wrapped, + // or if we didn't have an old count, set it to 1, so we never return + // a count of zero if we've ever observed a boot failure. + let new_ct = self + .host_boot_fail_state + .take() + .map(|s| s.total_count.wrapping_add(1)) + .unwrap_or(0) + .max(1); + self.host_boot_fail_state = Some(HostPanicMetadata { + total_length: n, + total_count: new_ct, + }); + + // ereport! + _ = self.ereporter.deliver_ereport(&HostBootFail { + ttl_ct: new_ct, + pan_len: n as u32, + }); Some(SpToHost::Ack) } HostToSp::HostPanic => { @@ -1011,11 +1050,30 @@ impl ServerImpl { data.len(), self.host_kv_storage.last_panic.len(), ); - self.host_kv_storage.last_panic[..n] - .copy_from_slice(&data[..n]); - for b in &mut self.host_kv_storage.last_panic[n..] { - *b = 0; - } + let (now, later) = + self.host_kv_storage.last_panic.split_at_mut(n); + now.copy_from_slice(&data[..n]); + later.iter_mut().for_each(|b| *b = 0); + + // Take the old count, if any, and add one to it. If that count wrapped, + // or if we didn't have an old count, set it to 1, so we never return + // a count of zero if we've ever observed a boot failure. + let new_ct = self + .host_panic_state + .take() + .map(|s| s.total_count.wrapping_add(1)) + .unwrap_or(0) + .max(1); + self.host_panic_state = Some(HostPanicMetadata { + total_length: n, + total_count: new_ct, + }); + + _ = self.ereporter.deliver_ereport(&HostPanic { + ttl_ct: new_ct, + pan_len: n as u32, + }); + Some(SpToHost::Ack) } HostToSp::GetStatus => { @@ -2012,3 +2070,40 @@ mod idl { } include!(concat!(env!("OUT_DIR"), "/notifications.rs")); + +ereports::declare_ereporter! { + struct Ereporter { + HostPanic(HostPanic), + BootPanic(HostBootFail), + } +} + +/// An ereport represent a host reported panic +#[derive(Encode)] +#[ereport(class = "hw.host.panic", version = 0)] +struct HostPanic { + /// The total number of panics observed this boot cycle. + /// + /// This count will wrap, but is guaranteed to never be zero. + ttl_ct: u32, + /// The length, in bytes, of the stored panic message. + /// + /// This quantity may be less than the amount received, as it is capped + /// by the available storage space allocated (`MAX_HOST_FAIL_MESSAGE_LEN`). + pan_len: u32, +} + +/// An ereport represent a host reported boot failure +#[derive(Encode)] +#[ereport(class = "hw.host.btfail", version = 0)] +struct HostBootFail { + /// The total number of panics observed this boot cycle. + /// + /// This count will wrap, but is guaranteed to never be zero. + ttl_ct: u32, + /// The length, in bytes, of the stored panic message. + /// + /// This quantity may be less than the amount received, as it is capped + /// by the available storage space allocated (`MAX_HOST_FAIL_MESSAGE_LEN`). + pan_len: u32, +} From d77c642e3b598807b2757a639ffaa8812c5e0949 Mon Sep 17 00:00:00 2001 From: James Munns Date: Fri, 8 May 2026 14:43:10 +0200 Subject: [PATCH 02/12] Also increase flash size for gimlet base --- app/gimlet/base.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/gimlet/base.toml b/app/gimlet/base.toml index 05b740397..657049a97 100644 --- a/app/gimlet/base.toml +++ b/app/gimlet/base.toml @@ -243,7 +243,7 @@ features = ["stm32h753", "uart7", "baud_rate_3M", "hardware_flow_control", "vlan uses = ["uart7", "dbgmcu"] interrupts = {"uart7.irq" = "usart-irq"} priority = 8 -max-sizes = {flash = 70000, ram = 65536} +max-sizes = {flash = 72000, ram = 65536} stacksize = 5376 start = true task-slots = ["sys", { cpu_seq = "gimlet_seq" }, "hf", "control_plane_agent", "net", "packrat", "i2c_driver", { spi_driver = "spi2_driver" }, "sprot"] From 6b3c1fbe58388a5d776087480574cce08f1f2c98 Mon Sep 17 00:00:00 2001 From: James Munns Date: Fri, 8 May 2026 16:00:15 +0200 Subject: [PATCH 03/12] Underscore not-yet-used field --- task/host-sp-comms/src/main.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 5ff34cc1b..0e8fc19b0 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -279,7 +279,8 @@ impl HostKeyValueStorage { /// Metadata about panics observed from the host struct HostPanicMetadata { /// Length in bytes of the currently stored panic message - total_length: usize, + /// (not currently used, will be used in https://github.com/oxidecomputer/hubris/issues/2504) + _total_length: usize, /// (hopefully not) Rolling counter of panic messages observed this power cycle total_count: u32, } @@ -1022,7 +1023,7 @@ impl ServerImpl { .unwrap_or(0) .max(1); self.host_boot_fail_state = Some(HostPanicMetadata { - total_length: n, + _total_length: n, total_count: new_ct, }); @@ -1065,7 +1066,7 @@ impl ServerImpl { .unwrap_or(0) .max(1); self.host_panic_state = Some(HostPanicMetadata { - total_length: n, + _total_length: n, total_count: new_ct, }); From ff3406868533d92c47d9a0e5cfcdbf67214f7dd6 Mon Sep 17 00:00:00 2001 From: James Munns Date: Thu, 14 May 2026 15:55:03 -0700 Subject: [PATCH 04/12] Apply suggestions from code review Co-authored-by: Eliza Weisman --- task/host-sp-comms/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 0e8fc19b0..df00cbca8 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -2081,7 +2081,7 @@ ereports::declare_ereporter! { /// An ereport represent a host reported panic #[derive(Encode)] -#[ereport(class = "hw.host.panic", version = 0)] +#[ereport(class = "host.panic", version = 0)] struct HostPanic { /// The total number of panics observed this boot cycle. /// @@ -2091,12 +2091,12 @@ struct HostPanic { /// /// This quantity may be less than the amount received, as it is capped /// by the available storage space allocated (`MAX_HOST_FAIL_MESSAGE_LEN`). - pan_len: u32, + msglen: u32, } /// An ereport represent a host reported boot failure #[derive(Encode)] -#[ereport(class = "hw.host.btfail", version = 0)] +#[ereport(class = "host.btfail", version = 0)] struct HostBootFail { /// The total number of panics observed this boot cycle. /// From f8a394556046db26536331bc81893699dd9fffd4 Mon Sep 17 00:00:00 2001 From: James Munns Date: Thu, 14 May 2026 16:00:31 -0700 Subject: [PATCH 05/12] Address review comments --- task/host-sp-comms/src/main.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index df00cbca8..763f1441c 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -1029,8 +1029,9 @@ impl ServerImpl { // ereport! _ = self.ereporter.deliver_ereport(&HostBootFail { - ttl_ct: new_ct, - pan_len: n as u32, + n: new_ct, + msglen: n as u32, + reason, }); Some(SpToHost::Ack) } @@ -1071,8 +1072,8 @@ impl ServerImpl { }); _ = self.ereporter.deliver_ereport(&HostPanic { - ttl_ct: new_ct, - pan_len: n as u32, + n: new_ct, + msglen: n as u32, }); Some(SpToHost::Ack) @@ -2083,10 +2084,11 @@ ereports::declare_ereporter! { #[derive(Encode)] #[ereport(class = "host.panic", version = 0)] struct HostPanic { - /// The total number of panics observed this boot cycle. + /// The total number of host panics observed by this invocation of + /// host-sp-comms. /// /// This count will wrap, but is guaranteed to never be zero. - ttl_ct: u32, + n: u32, /// The length, in bytes, of the stored panic message. /// /// This quantity may be less than the amount received, as it is capped @@ -2098,13 +2100,16 @@ struct HostPanic { #[derive(Encode)] #[ereport(class = "host.btfail", version = 0)] struct HostBootFail { - /// The total number of panics observed this boot cycle. + /// The total number of host boot failures observed by this invocation + /// of host-sp-comms. /// /// This count will wrap, but is guaranteed to never be zero. - ttl_ct: u32, + n: u32, /// The length, in bytes, of the stored panic message. /// /// This quantity may be less than the amount received, as it is capped /// by the available storage space allocated (`MAX_HOST_FAIL_MESSAGE_LEN`). - pan_len: u32, + msglen: u32, + /// The reported reason code for the host boot failure + reason: u8, } From 5e68f88c1dd2d9666699c8b4f6039f063468ca0e Mon Sep 17 00:00:00 2001 From: James Munns Date: Thu, 14 May 2026 16:33:38 -0700 Subject: [PATCH 06/12] Add flash index field as requested --- app/cosmo/base.toml | 2 +- app/gimlet/base.toml | 2 +- task/host-sp-comms/src/main.rs | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/app/cosmo/base.toml b/app/cosmo/base.toml index acc793a9a..5dd1d635a 100644 --- a/app/cosmo/base.toml +++ b/app/cosmo/base.toml @@ -258,7 +258,7 @@ features = ["stm32h753", "usart6", "baud_rate_3M", "hardware_flow_control", "vla uses = ["usart6", "dbgmcu"] interrupts = {"usart6.irq" = "usart-irq"} priority = 9 -max-sizes = {flash = 72000, ram = 65536} +max-sizes = {flash = 74000, ram = 65536} stacksize = 5400 start = true task-slots = ["sys", { cpu_seq = "cosmo_seq" }, "hf", "control_plane_agent", "net", "packrat", "i2c_driver", { spi_driver = "spi2_driver" }, "sprot", "auxflash"] diff --git a/app/gimlet/base.toml b/app/gimlet/base.toml index 657049a97..3e6fb364d 100644 --- a/app/gimlet/base.toml +++ b/app/gimlet/base.toml @@ -243,7 +243,7 @@ features = ["stm32h753", "uart7", "baud_rate_3M", "hardware_flow_control", "vlan uses = ["uart7", "dbgmcu"] interrupts = {"uart7.irq" = "usart-irq"} priority = 8 -max-sizes = {flash = 72000, ram = 65536} +max-sizes = {flash = 74000, ram = 65536} stacksize = 5376 start = true task-slots = ["sys", { cpu_seq = "gimlet_seq" }, "hf", "control_plane_agent", "net", "packrat", "i2c_driver", { spi_driver = "spi2_driver" }, "sprot"] diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 763f1441c..a9acabeda 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -1027,11 +1027,18 @@ impl ServerImpl { total_count: new_ct, }); + let flashidx = match self.hf.get_dev() { + Ok(HfDevSelect::Flash0) => 0, + Ok(HfDevSelect::Flash1) => 1, + Err(_) => 0xFF, + }; + // ereport! _ = self.ereporter.deliver_ereport(&HostBootFail { n: new_ct, msglen: n as u32, reason, + flashidx, }); Some(SpToHost::Ack) } @@ -1071,9 +1078,17 @@ impl ServerImpl { total_count: new_ct, }); + let flashidx = match self.hf.get_dev() { + Ok(HfDevSelect::Flash0) => 0, + Ok(HfDevSelect::Flash1) => 1, + Err(_) => 0xFF, + }; + + // ereport! _ = self.ereporter.deliver_ereport(&HostPanic { n: new_ct, msglen: n as u32, + flashidx, }); Some(SpToHost::Ack) @@ -2094,6 +2109,9 @@ struct HostPanic { /// This quantity may be less than the amount received, as it is capped /// by the available storage space allocated (`MAX_HOST_FAIL_MESSAGE_LEN`). msglen: u32, + /// The flash boot index, directly correlated to which boot slot we are + /// operating from. Currently 0 (BSU: A), 1 (BSU: B), or 0xFF (unknown). + flashidx: u8, } /// An ereport represent a host reported boot failure @@ -2112,4 +2130,7 @@ struct HostBootFail { msglen: u32, /// The reported reason code for the host boot failure reason: u8, + /// The flash boot index, directly correlated to which boot slot we are + /// operating from. Currently 0 (BSU: A), 1 (BSU: B), or 0xFF (unknown). + flashidx: u8, } From 20879b0fcd6fbfe6c4ff3b9ee1cab7c1db2d5fd4 Mon Sep 17 00:00:00 2001 From: James Munns Date: Fri, 15 May 2026 14:05:16 -0700 Subject: [PATCH 07/12] Working on packrat --- idl/packrat.idol | 27 ++++++++++++++++++++ task/packrat/src/main.rs | 53 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/idl/packrat.idol b/idl/packrat.idol index c8bd47691..f0261cd1a 100644 --- a/idl/packrat.idol +++ b/idl/packrat.idol @@ -144,5 +144,32 @@ Interface( ), idempotent: true, ), + "write_host_bootfail": ( + doc: "Write a host's boot failure message and return the index of this failure", + args: { + }, + leases: { + "data": (type: "[u8]", read: true), + }, + reply: Result( + ok: "u32", + err: CLike("EreportReadError"), + ), + ), + "read_host_bootfail_fragment": ( + doc: "Read a portion of the host's boot failure message", + args: { + "index": "u32", + "offset": "usize", + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "usize", + err: CLike("EreportReadError"), + ), + idempotent: true, + ), }, ) diff --git a/task/packrat/src/main.rs b/task/packrat/src/main.rs index 6cf790b10..c18185612 100644 --- a/task/packrat/src/main.rs +++ b/task/packrat/src/main.rs @@ -154,12 +154,45 @@ enum TraceSet { AttemptedSetToNewValue(T), } +/// Metadata about panics observed from the host +struct HostPanicMetadata { + /// Length in bytes of the currently stored panic message + /// (not currently used, will be used in https://github.com/oxidecomputer/hubris/issues/2504) + _total_length: usize, + /// (hopefully not) Rolling counter of panic messages observed this power cycle + total_count: u32, +} + +pub struct HostInfo { + host_panic_payload: [u8; 4096], + host_bootfail_payload: [u8; 4096], + host_panic_state: Option, + host_bootfail_state: Option, +} + +impl HostInfo { + const fn new() -> Self { + Self { + host_panic_payload: [0u8; _], + host_bootfail_payload: [0u8; _], + host_panic_state: None, + host_bootfail_state: None, + } + } +} + ringbuf!(Trace, 16, Trace::None); #[unsafe(export_name = "main")] fn main() -> ! { struct StaticBufs { mac_address_block: Option, identity: Option, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + host_info: HostInfo, #[cfg(feature = "gimlet")] gimlet_bufs: gimlet::StaticBufs, #[cfg(feature = "cosmo")] @@ -170,6 +203,12 @@ fn main() -> ! { let &mut StaticBufs { ref mut mac_address_block, ref mut identity, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + ref mut host_info, #[cfg(feature = "gimlet")] ref mut gimlet_bufs, #[cfg(feature = "cosmo")] @@ -181,6 +220,12 @@ fn main() -> ! { ClaimOnceCell::new(StaticBufs { mac_address_block: None, identity: None, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + host_info: HostInfo::new(), #[cfg(feature = "gimlet")] gimlet_bufs: gimlet::StaticBufs::new(), #[cfg(feature = "cosmo")] @@ -194,6 +239,12 @@ fn main() -> ! { let mut server = ServerImpl { mac_address_block, identity, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + host_info, #[cfg(feature = "gimlet")] gimlet_data: gimlet::GimletData::new(gimlet_bufs), #[cfg(feature = "grapefruit")] @@ -213,6 +264,8 @@ fn main() -> ! { struct ServerImpl { mac_address_block: &'static mut Option, identity: &'static mut Option, + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + host_info: &'static mut HostInfo, #[cfg(feature = "gimlet")] gimlet_data: gimlet::GimletData, #[cfg(feature = "grapefruit")] From bc62fe9fc52eb3b7a98de73d15111e2188e85a2c Mon Sep 17 00:00:00 2001 From: James Munns Date: Mon, 18 May 2026 15:04:20 +0200 Subject: [PATCH 08/12] Implement bootfail APIs in packrat --- idl/packrat.idol | 9 ++- task/packrat-api/src/lib.rs | 54 +++++++++++++ task/packrat/src/main.rs | 147 ++++++++++++++++++++++++++++++++++-- 3 files changed, 201 insertions(+), 9 deletions(-) diff --git a/idl/packrat.idol b/idl/packrat.idol index f0261cd1a..603065ae2 100644 --- a/idl/packrat.idol +++ b/idl/packrat.idol @@ -147,13 +147,14 @@ Interface( "write_host_bootfail": ( doc: "Write a host's boot failure message and return the index of this failure", args: { + "reason": "u8", }, leases: { "data": (type: "[u8]", read: true), }, reply: Result( - ok: "u32", - err: CLike("EreportReadError"), + ok: "HostInfoWriteOutput", + err: CLike("HostInfoWriteError"), ), ), "read_host_bootfail_fragment": ( @@ -166,8 +167,8 @@ Interface( "data": (type: "[u8]", write: true), }, reply: Result( - ok: "usize", - err: CLike("EreportReadError"), + ok: "HostInfoReadOutput", + err: CLike("HostInfoReadError"), ), idempotent: true, ), diff --git a/task/packrat-api/src/lib.rs b/task/packrat-api/src/lib.rs index ee6f0d6da..796f93cd3 100644 --- a/task/packrat-api/src/lib.rs +++ b/task/packrat-api/src/lib.rs @@ -72,6 +72,60 @@ pub enum EreportWriteError { Lost = 1, } +#[derive( + Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, IdolError, counters::Count, +)] +pub enum HostInfoWriteError { + /// Storage of Host Info is not supported (this is a device not expected to + /// have a host) + Unsupported = 1, + + /// Failed to read from a provided lease, which probably means that the client + /// died before we serviced the request, so they'll probably never see this + LeaseLost, + + #[idol(server_death)] + ServerRestarted, +} + +#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, Immutable)] +#[repr(C)] +pub struct HostInfoWriteOutput { + pub index: u32, + pub written: usize, +} + +#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, Immutable)] +#[repr(C)] +pub struct HostInfoReadOutput { + pub read: usize, + pub reason: u8, + pub _pad: [u8; 3], +} + +#[derive( + Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, IdolError, counters::Count, +)] +pub enum HostInfoReadError { + /// We have never received the requested Host Info, or this is a platform + /// that is not expected to have a host. + NoHostInfo = 1, + + /// The requested byte-offset is beyond the range of the currently stored + /// host info. + InvalidOffset, + + /// Requested index does not match the currently stored host information + InvalidIndex, + + /// Failed to write to a provided lease, which probably means that the client + /// died before we serviced the request, so they'll probably never see this + LeaseLost, + + #[idol(server_death)] + ServerRestarted, +} + /// Errors returned by [`Packrat::encode_ereport`]. #[derive(counters::Count)] #[cfg(feature = "microcbor")] diff --git a/task/packrat/src/main.rs b/task/packrat/src/main.rs index c18185612..f2361eb07 100644 --- a/task/packrat/src/main.rs +++ b/task/packrat/src/main.rs @@ -70,7 +70,8 @@ use ringbuf::{ringbuf, ringbuf_entry}; use static_cell::ClaimOnceCell; use task_packrat_api::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostStartupOptions, MacAddressBlock, OxideIdentity, + HostInfoReadError, HostInfoReadOutput, HostInfoWriteError, + HostInfoWriteOutput, HostStartupOptions, MacAddressBlock, OxideIdentity, }; use userlib::RecvMessage; @@ -157,17 +158,26 @@ enum TraceSet { /// Metadata about panics observed from the host struct HostPanicMetadata { /// Length in bytes of the currently stored panic message - /// (not currently used, will be used in https://github.com/oxidecomputer/hubris/issues/2504) - _total_length: usize, + total_length: usize, /// (hopefully not) Rolling counter of panic messages observed this power cycle total_count: u32, } +/// Metadata about panics observed from the host +struct HostBootFailMetadata { + /// Length in bytes of the currently stored bootfail message + total_length: usize, + /// (hopefully not) Rolling counter of panic messages observed this power cycle + total_count: u32, + /// Bootfail reason + reason: u8, +} + pub struct HostInfo { host_panic_payload: [u8; 4096], host_bootfail_payload: [u8; 4096], host_panic_state: Option, - host_bootfail_state: Option, + host_bootfail_state: Option, } impl HostInfo { @@ -622,6 +632,131 @@ impl idl::InOrderPackratImpl for ServerImpl { self.identity.as_ref(), ) } + + /// We're not a system that is expected to have a host, so we shouldn't have + /// anyone writing host bootfail messages to us + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn write_host_bootfail( + &mut self, + _msg: &userlib::RecvMessage, + _data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + Err(HostInfoWriteError::Unsupported.into()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn write_host_bootfail( + &mut self, + _msg: &userlib::RecvMessage, + reason: u8, + data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + // First, attempt to copy-in the new data, to ensure that we don't rev the + // metadata if the lease access fails. + let to_copy = + self.host_info.host_bootfail_payload.len().min(data.len()); + data.read_range( + 0..to_copy, + &mut self.host_info.host_bootfail_payload[..to_copy], + ) + .map_err(|_| HostInfoWriteError::LeaseLost)?; + + // Okay! We've written the requested data. Let's update the metadata. + // + // Take the old count, if any, and add one to it. If that count wrapped, + // or if we didn't have an old count, set it to 1, so we never return + // a count of zero if we've ever observed a boot failure. + let new_ct = self + .host_info + .host_bootfail_state + .take() + .map(|s| s.total_count.wrapping_add(1)) + .unwrap_or(0) + .max(1); + self.host_info.host_bootfail_state = Some(HostBootFailMetadata { + total_length: to_copy, + total_count: new_ct, + reason, + }); + + // Give the writer the current index and the number of bytes actually written + Ok(HostInfoWriteOutput { + index: new_ct, + written: to_copy, + }) + } + + /// We're not a system that is expected to have a host, therefore we can always return + /// "no host info", since we won't ever have any. + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn read_host_bootfail_fragment( + &self, + _msg: &userlib::RecvMessage, + _index: u32, + _offset: usize, + _data: Leased, + ) -> Result> + { + Err(HostInfoReadError::NoHostInfo.into()) + } + + /// Attempt to obtain the requested host info. + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn read_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + index: u32, + offset: usize, + data: Leased, + ) -> Result> + { + // Do we *have* a bootfail to report? + let Some(bfs) = self.host_info.host_bootfail_state.as_ref() else { + return Err(HostInfoReadError::NoHostInfo.into()); + }; + + // Do we have the specific bootfail data being requested? + if bfs.total_count != index { + return Err(HostInfoReadError::InvalidIndex.into()); + } + + // Is the offset requested valid? + let offset_max = bfs + .total_length + .min(self.host_info.host_bootfail_payload.len()); + if offset >= offset_max { + return Err(HostInfoReadError::InvalidOffset.into()); + } + + // Attempt to copy the requested range into the destination + let relevant = &self.host_info.host_bootfail_payload[offset_max..]; + let max_to_copy = data.len().min(relevant.len()); + data.write_range(0..max_to_copy, &relevant[..max_to_copy]) + .map_err(|_| HostInfoReadError::LeaseLost)?; + + let out = HostInfoReadOutput { + read: max_to_copy, + reason: bfs.reason, + _pad: [0u8; _], + }; + + // Okay! Written! Return how many bytes were actually copied + Ok(out) + } } // If we are not built with ereport support, we expect no notifications. @@ -657,7 +792,9 @@ impl NotificationHandler for ServerImpl { mod idl { use super::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostStartupOptions, MacAddressBlock, OxideIdentity, ereport_messages, + HostInfoReadError, HostInfoReadOutput, HostInfoWriteError, + HostInfoWriteOutput, HostStartupOptions, MacAddressBlock, + OxideIdentity, ereport_messages, }; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); From 3e2b6557e456b286d8f69de643d60bd36eb9806d Mon Sep 17 00:00:00 2001 From: James Munns Date: Mon, 18 May 2026 15:56:24 +0200 Subject: [PATCH 09/12] Plumb panic handling from the host-sp-comms side --- idl/packrat.idol | 29 +++++++- task/host-sp-comms/src/main.rs | 111 +++++++--------------------- task/packrat-api/src/lib.rs | 2 +- task/packrat/src/main.rs | 131 +++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 95 deletions(-) diff --git a/idl/packrat.idol b/idl/packrat.idol index 603065ae2..c1080bd11 100644 --- a/idl/packrat.idol +++ b/idl/packrat.idol @@ -167,7 +167,34 @@ Interface( "data": (type: "[u8]", write: true), }, reply: Result( - ok: "HostInfoReadOutput", + ok: "HostBootfailReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), + "write_host_panic": ( + doc: "Write a host's panic message and return the index of this panic", + args: { + }, + leases: { + "data": (type: "[u8]", read: true), + }, + reply: Result( + ok: "HostInfoWriteOutput", + err: CLike("HostInfoWriteError"), + ), + ), + "read_host_panic_fragment": ( + doc: "Read a portion of the host's panic message", + args: { + "index": "u32", + "offset": "usize", + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "usize", err: CLike("HostInfoReadError"), ), idempotent: true, diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index a9acabeda..451ed8daf 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -91,9 +91,6 @@ const A2_REBOOT_DELAY: u64 = 5_000; // response to send, and we haven't yet started to receive a request). const UART_ZERO_DELAY: u64 = 200; -// How long of a host panic / boot fail message are we willing to keep? -const MAX_HOST_FAIL_MESSAGE_LEN: usize = 4096; - // How many MAC addresses should we report to the host? Per RFD 320, a gimlet // currently needs 5 total: // @@ -234,9 +231,6 @@ const MAX_DTRACE_CONF_LEN: usize = 4096; // data for later read back (either by the host itself or by the control plane // via MGS). struct HostKeyValueStorage { - last_boot_fail_reason: u8, - last_boot_fail: &'static mut [u8; MAX_HOST_FAIL_MESSAGE_LEN], - last_panic: &'static mut [u8; MAX_HOST_FAIL_MESSAGE_LEN], etc_system: &'static mut [u8; MAX_ETC_SYSTEM_LEN], etc_system_len: usize, dtrace_conf: &'static mut [u8; MAX_DTRACE_CONF_LEN], @@ -276,15 +270,6 @@ impl HostKeyValueStorage { } } -/// Metadata about panics observed from the host -struct HostPanicMetadata { - /// Length in bytes of the currently stored panic message - /// (not currently used, will be used in https://github.com/oxidecomputer/hubris/issues/2504) - _total_length: usize, - /// (hopefully not) Rolling counter of panic messages observed this power cycle - total_count: u32, -} - struct ServerImpl { uart: Usart, sys: sys_api::Sys, @@ -303,8 +288,6 @@ struct ServerImpl { hf_mux_state: Option, ereporter: Ereporter, - host_panic_state: Option, - host_boot_fail_state: Option, /// Temporary space for inventory data, which is a large `enum` scratch: &'static mut host_sp_messages::InventoryData, @@ -345,8 +328,6 @@ impl ServerImpl { struct Bufs { tx_buf: tx_buf::StaticBufs, rx_buf: Vec, - last_boot_fail: [u8; MAX_HOST_FAIL_MESSAGE_LEN], - last_panic: [u8; MAX_HOST_FAIL_MESSAGE_LEN], etc_system: [u8; MAX_ETC_SYSTEM_LEN], dtrace_conf: [u8; MAX_DTRACE_CONF_LEN], scratch: host_sp_messages::InventoryData, @@ -360,8 +341,6 @@ impl ServerImpl { let &mut Bufs { ref mut tx_buf, ref mut rx_buf, - ref mut last_boot_fail, - ref mut last_panic, ref mut etc_system, ref mut dtrace_conf, ref mut scratch, @@ -375,8 +354,6 @@ impl ServerImpl { static BUFS: ClaimOnceCell = ClaimOnceCell::new(Bufs { tx_buf: tx_buf::StaticBufs::new(), rx_buf: Vec::new(), - last_boot_fail: [0; MAX_HOST_FAIL_MESSAGE_LEN], - last_panic: [0; MAX_HOST_FAIL_MESSAGE_LEN], etc_system: [0; MAX_ETC_SYSTEM_LEN], dtrace_conf: [0; MAX_DTRACE_CONF_LEN], #[cfg(not(any( @@ -420,9 +397,6 @@ impl ServerImpl { sprot: SpRot::from(SPROT.get_task_id()), reboot_state: None, host_kv_storage: HostKeyValueStorage { - last_boot_fail_reason: 0, - last_boot_fail, - last_panic, etc_system, etc_system_len: 0, dtrace_conf, @@ -431,8 +405,6 @@ impl ServerImpl { hf_mux_state: None, last_power_off: None, scratch, - host_panic_state: None, - host_boot_fail_state: None, ereporter: Ereporter::claim_static_resources(packrat), } } @@ -998,35 +970,19 @@ impl ServerImpl { // Indicate that the host boot failed, so that we can then tell // sequencer why we are asking it to power off the system. self.last_power_off = Some(StateChangeReason::HostBootFailure); - // TODO forward to MGS - // - // For now, copy it into a static var we can pull out via - // `humility host boot-fail`. - let n = usize::min( - data.len(), - self.host_kv_storage.last_boot_fail.len(), - ); - let (now, later) = - self.host_kv_storage.last_boot_fail.split_at_mut(n); - now.copy_from_slice(&data[..n]); - later.iter_mut().for_each(|b| *b = 0); - - self.host_kv_storage.last_boot_fail_reason = reason; - - // Take the old count, if any, and add one to it. If that count wrapped, - // or if we didn't have an old count, set it to 1, so we never return - // a count of zero if we've ever observed a boot failure. - let new_ct = self - .host_boot_fail_state - .take() - .map(|s| s.total_count.wrapping_add(1)) - .unwrap_or(0) - .max(1); - self.host_boot_fail_state = Some(HostPanicMetadata { - _total_length: n, - total_count: new_ct, - }); + // Store the bootfail message in packrat so it can be accessed by MGS in the future + // TODO: What to do if this call fails? Without it, we don't have a proper index, but + // this would only happen if packrat crashes. We could store some fake info here to + // continue preparing an ereport, but THAT is going to be a problem anyway because + // we send ereports to, you guessed it: packrat, which just crashed. + let response = self + .packrat + .write_host_bootfail(reason, data) + .unwrap_lite(); + + // TODO: Do we want to give this to packrat too? + // TODO: Update `humility host boot-fail` to use packrat API! let flashidx = match self.hf.get_dev() { Ok(HfDevSelect::Flash0) => 0, Ok(HfDevSelect::Flash1) => 1, @@ -1035,8 +991,8 @@ impl ServerImpl { // ereport! _ = self.ereporter.deliver_ereport(&HostBootFail { - n: new_ct, - msglen: n as u32, + n: response.index, + msglen: response.written as u32, reason, flashidx, }); @@ -1051,33 +1007,16 @@ impl ServerImpl { self.last_power_off = Some(StateChangeReason::HostPanic); } - // TODO forward to MGS - // - // For now, copy it into a static var we can pull out via - // `humility host last-panic`. - let n = usize::min( - data.len(), - self.host_kv_storage.last_panic.len(), - ); - let (now, later) = - self.host_kv_storage.last_panic.split_at_mut(n); - now.copy_from_slice(&data[..n]); - later.iter_mut().for_each(|b| *b = 0); - - // Take the old count, if any, and add one to it. If that count wrapped, - // or if we didn't have an old count, set it to 1, so we never return - // a count of zero if we've ever observed a boot failure. - let new_ct = self - .host_panic_state - .take() - .map(|s| s.total_count.wrapping_add(1)) - .unwrap_or(0) - .max(1); - self.host_panic_state = Some(HostPanicMetadata { - _total_length: n, - total_count: new_ct, - }); + // Store the panic message in packrat so it can be accessed by MGS in the future + // TODO: What to do if this call fails? Without it, we don't have a proper index, but + // this would only happen if packrat crashes. We could store some fake info here to + // continue preparing an ereport, but THAT is going to be a problem anyway because + // we send ereports to, you guessed it: packrat, which just crashed. + let response = + self.packrat.write_host_panic(data).unwrap_lite(); + // TODO: Do we want to give this to packrat too? + // TODO: Update `humility host last-panic` to use packrat API! let flashidx = match self.hf.get_dev() { Ok(HfDevSelect::Flash0) => 0, Ok(HfDevSelect::Flash1) => 1, @@ -1086,8 +1025,8 @@ impl ServerImpl { // ereport! _ = self.ereporter.deliver_ereport(&HostPanic { - n: new_ct, - msglen: n as u32, + n: response.index, + msglen: response.written as u32, flashidx, }); diff --git a/task/packrat-api/src/lib.rs b/task/packrat-api/src/lib.rs index 796f93cd3..679a432da 100644 --- a/task/packrat-api/src/lib.rs +++ b/task/packrat-api/src/lib.rs @@ -97,7 +97,7 @@ pub struct HostInfoWriteOutput { #[derive(Copy, Clone, Debug, FromBytes, IntoBytes, Immutable)] #[repr(C)] -pub struct HostInfoReadOutput { +pub struct HostBootfailReadOutput { pub read: usize, pub reason: u8, pub _pad: [u8; 3], diff --git a/task/packrat/src/main.rs b/task/packrat/src/main.rs index f2361eb07..901203728 100644 --- a/task/packrat/src/main.rs +++ b/task/packrat/src/main.rs @@ -70,7 +70,7 @@ use ringbuf::{ringbuf, ringbuf_entry}; use static_cell::ClaimOnceCell; use task_packrat_api::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostInfoReadError, HostInfoReadOutput, HostInfoWriteError, + HostBootfailReadOutput, HostInfoReadError, HostInfoWriteError, HostInfoWriteOutput, HostStartupOptions, MacAddressBlock, OxideIdentity, }; use userlib::RecvMessage; @@ -709,8 +709,10 @@ impl idl::InOrderPackratImpl for ServerImpl { _index: u32, _offset: usize, _data: Leased, - ) -> Result> - { + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { Err(HostInfoReadError::NoHostInfo.into()) } @@ -722,8 +724,10 @@ impl idl::InOrderPackratImpl for ServerImpl { index: u32, offset: usize, data: Leased, - ) -> Result> - { + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { // Do we *have* a bootfail to report? let Some(bfs) = self.host_info.host_bootfail_state.as_ref() else { return Err(HostInfoReadError::NoHostInfo.into()); @@ -748,7 +752,7 @@ impl idl::InOrderPackratImpl for ServerImpl { data.write_range(0..max_to_copy, &relevant[..max_to_copy]) .map_err(|_| HostInfoReadError::LeaseLost)?; - let out = HostInfoReadOutput { + let out = HostBootfailReadOutput { read: max_to_copy, reason: bfs.reason, _pad: [0u8; _], @@ -757,6 +761,119 @@ impl idl::InOrderPackratImpl for ServerImpl { // Okay! Written! Return how many bytes were actually copied Ok(out) } + + /// We're not a system that is expected to have a host, therefore we can always return + /// "no host info", since we won't ever have any. + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn write_host_panic( + &mut self, + _msg: &userlib::RecvMessage, + _data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + Err(HostInfoWriteError::Unsupported.into()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn write_host_panic( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + // First, attempt to copy-in the new data, to ensure that we don't rev the + // metadata if the lease access fails. + let to_copy = self.host_info.host_panic_payload.len().min(data.len()); + data.read_range( + 0..to_copy, + &mut self.host_info.host_panic_payload[..to_copy], + ) + .map_err(|_| HostInfoWriteError::LeaseLost)?; + + // Okay! We've written the requested data. Let's update the metadata. + // + // Take the old count, if any, and add one to it. If that count wrapped, + // or if we didn't have an old count, set it to 1, so we never return + // a count of zero if we've ever observed a panic. + let new_ct = self + .host_info + .host_panic_state + .take() + .map(|s| s.total_count.wrapping_add(1)) + .unwrap_or(0) + .max(1); + self.host_info.host_panic_state = Some(HostPanicMetadata { + total_length: to_copy, + total_count: new_ct, + }); + + // Give the writer the current index and the number of bytes actually written + Ok(HostInfoWriteOutput { + index: new_ct, + written: to_copy, + }) + } + + /// We're not a system that is expected to have a host, therefore we can always return + /// "no host info", since we won't ever have any. + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn read_host_panic_fragment( + &mut self, + _msg: &userlib::RecvMessage, + _index: u32, + _offset: usize, + _data: Leased, + ) -> Result> { + Err(HostInfoReadError::NoHostInfo.into()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn read_host_panic_fragment( + &mut self, + _msg: &userlib::RecvMessage, + index: u32, + offset: usize, + data: Leased, + ) -> Result> { + // Do we *have* a panic to report? + let Some(bfs) = self.host_info.host_panic_state.as_ref() else { + return Err(HostInfoReadError::NoHostInfo.into()); + }; + + // Do we have the specific panic data being requested? + if bfs.total_count != index { + return Err(HostInfoReadError::InvalidIndex.into()); + } + + // Is the offset requested valid? + let offset_max = bfs + .total_length + .min(self.host_info.host_panic_payload.len()); + if offset >= offset_max { + return Err(HostInfoReadError::InvalidOffset.into()); + } + + // Attempt to copy the requested range into the destination + let relevant = &self.host_info.host_panic_payload[offset_max..]; + let max_to_copy = data.len().min(relevant.len()); + data.write_range(0..max_to_copy, &relevant[..max_to_copy]) + .map_err(|_| HostInfoReadError::LeaseLost)?; + + // Okay! Written! Return how many bytes were actually copied + Ok(max_to_copy) + } } // If we are not built with ereport support, we expect no notifications. @@ -792,7 +909,7 @@ impl NotificationHandler for ServerImpl { mod idl { use super::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostInfoReadError, HostInfoReadOutput, HostInfoWriteError, + HostBootfailReadOutput, HostInfoReadError, HostInfoWriteError, HostInfoWriteOutput, HostStartupOptions, MacAddressBlock, OxideIdentity, ereport_messages, }; From 42e09658fdb788707a6c45be7fc929a94626f9a5 Mon Sep 17 00:00:00 2001 From: James Munns Date: Tue, 19 May 2026 10:24:46 +0200 Subject: [PATCH 10/12] Implement suggestions from @hawkw --- idl/packrat.idol | 4 ++-- task/packrat-api/src/lib.rs | 20 -------------------- task/packrat/src/main.rs | 35 ++++++++++++++++++----------------- 3 files changed, 20 insertions(+), 39 deletions(-) diff --git a/idl/packrat.idol b/idl/packrat.idol index c1080bd11..f4abc10a9 100644 --- a/idl/packrat.idol +++ b/idl/packrat.idol @@ -154,7 +154,7 @@ Interface( }, reply: Result( ok: "HostInfoWriteOutput", - err: CLike("HostInfoWriteError"), + err: ServerDeath, ), ), "read_host_bootfail_fragment": ( @@ -181,7 +181,7 @@ Interface( }, reply: Result( ok: "HostInfoWriteOutput", - err: CLike("HostInfoWriteError"), + err: ServerDeath, ), ), "read_host_panic_fragment": ( diff --git a/task/packrat-api/src/lib.rs b/task/packrat-api/src/lib.rs index 679a432da..d63771cfb 100644 --- a/task/packrat-api/src/lib.rs +++ b/task/packrat-api/src/lib.rs @@ -72,22 +72,6 @@ pub enum EreportWriteError { Lost = 1, } -#[derive( - Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, IdolError, counters::Count, -)] -pub enum HostInfoWriteError { - /// Storage of Host Info is not supported (this is a device not expected to - /// have a host) - Unsupported = 1, - - /// Failed to read from a provided lease, which probably means that the client - /// died before we serviced the request, so they'll probably never see this - LeaseLost, - - #[idol(server_death)] - ServerRestarted, -} - #[derive(Copy, Clone, Debug, FromBytes, IntoBytes, Immutable)] #[repr(C)] pub struct HostInfoWriteOutput { @@ -118,10 +102,6 @@ pub enum HostInfoReadError { /// Requested index does not match the currently stored host information InvalidIndex, - /// Failed to write to a provided lease, which probably means that the client - /// died before we serviced the request, so they'll probably never see this - LeaseLost, - #[idol(server_death)] ServerRestarted, } diff --git a/task/packrat/src/main.rs b/task/packrat/src/main.rs index 901203728..29ac6bf8a 100644 --- a/task/packrat/src/main.rs +++ b/task/packrat/src/main.rs @@ -65,13 +65,15 @@ use core::convert::Infallible; use gateway_ereport_messages as ereport_messages; -use idol_runtime::{Leased, LenLimit, NotificationHandler, RequestError}; +use idol_runtime::{ + ClientError, Leased, LenLimit, NotificationHandler, RequestError, +}; use ringbuf::{ringbuf, ringbuf_entry}; use static_cell::ClaimOnceCell; use task_packrat_api::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostBootfailReadOutput, HostInfoReadError, HostInfoWriteError, - HostInfoWriteOutput, HostStartupOptions, MacAddressBlock, OxideIdentity, + HostBootfailReadOutput, HostInfoReadError, HostInfoWriteOutput, + HostStartupOptions, MacAddressBlock, OxideIdentity, }; use userlib::RecvMessage; @@ -646,9 +648,9 @@ impl idl::InOrderPackratImpl for ServerImpl { _data: Leased, ) -> Result< HostInfoWriteOutput, - idol_runtime::RequestError, + idol_runtime::RequestError, > { - Err(HostInfoWriteError::Unsupported.into()) + Err(idol_runtime::ClientError::UnknownOperation.fail()) } #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] @@ -659,7 +661,7 @@ impl idl::InOrderPackratImpl for ServerImpl { data: Leased, ) -> Result< HostInfoWriteOutput, - idol_runtime::RequestError, + idol_runtime::RequestError, > { // First, attempt to copy-in the new data, to ensure that we don't rev the // metadata if the lease access fails. @@ -669,7 +671,7 @@ impl idl::InOrderPackratImpl for ServerImpl { 0..to_copy, &mut self.host_info.host_bootfail_payload[..to_copy], ) - .map_err(|_| HostInfoWriteError::LeaseLost)?; + .map_err(|_| ClientError::WentAway.fail())?; // Okay! We've written the requested data. Let's update the metadata. // @@ -704,7 +706,7 @@ impl idl::InOrderPackratImpl for ServerImpl { feature = "cosmo" )))] fn read_host_bootfail_fragment( - &self, + &mut self, _msg: &userlib::RecvMessage, _index: u32, _offset: usize, @@ -750,7 +752,7 @@ impl idl::InOrderPackratImpl for ServerImpl { let relevant = &self.host_info.host_bootfail_payload[offset_max..]; let max_to_copy = data.len().min(relevant.len()); data.write_range(0..max_to_copy, &relevant[..max_to_copy]) - .map_err(|_| HostInfoReadError::LeaseLost)?; + .map_err(|_| ClientError::WentAway.fail())?; let out = HostBootfailReadOutput { read: max_to_copy, @@ -775,9 +777,9 @@ impl idl::InOrderPackratImpl for ServerImpl { _data: Leased, ) -> Result< HostInfoWriteOutput, - idol_runtime::RequestError, + idol_runtime::RequestError, > { - Err(HostInfoWriteError::Unsupported.into()) + Err(idol_runtime::ClientError::UnknownOperation.fail()) } #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] @@ -787,7 +789,7 @@ impl idl::InOrderPackratImpl for ServerImpl { data: Leased, ) -> Result< HostInfoWriteOutput, - idol_runtime::RequestError, + idol_runtime::RequestError, > { // First, attempt to copy-in the new data, to ensure that we don't rev the // metadata if the lease access fails. @@ -796,7 +798,7 @@ impl idl::InOrderPackratImpl for ServerImpl { 0..to_copy, &mut self.host_info.host_panic_payload[..to_copy], ) - .map_err(|_| HostInfoWriteError::LeaseLost)?; + .map_err(|_| ClientError::WentAway.fail())?; // Okay! We've written the requested data. Let's update the metadata. // @@ -869,7 +871,7 @@ impl idl::InOrderPackratImpl for ServerImpl { let relevant = &self.host_info.host_panic_payload[offset_max..]; let max_to_copy = data.len().min(relevant.len()); data.write_range(0..max_to_copy, &relevant[..max_to_copy]) - .map_err(|_| HostInfoReadError::LeaseLost)?; + .map_err(|_| ClientError::WentAway.fail())?; // Okay! Written! Return how many bytes were actually copied Ok(max_to_copy) @@ -909,9 +911,8 @@ impl NotificationHandler for ServerImpl { mod idl { use super::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostBootfailReadOutput, HostInfoReadError, HostInfoWriteError, - HostInfoWriteOutput, HostStartupOptions, MacAddressBlock, - OxideIdentity, ereport_messages, + HostBootfailReadOutput, HostInfoReadError, HostInfoWriteOutput, + HostStartupOptions, MacAddressBlock, OxideIdentity, ereport_messages, }; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); From b2deca6579b03f91a44fc94161a453cef395b55b Mon Sep 17 00:00:00 2001 From: James Munns Date: Tue, 19 May 2026 11:46:24 +0200 Subject: [PATCH 11/12] Add missing argument --- task/packrat/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/task/packrat/src/main.rs b/task/packrat/src/main.rs index 29ac6bf8a..67599a83d 100644 --- a/task/packrat/src/main.rs +++ b/task/packrat/src/main.rs @@ -645,6 +645,7 @@ impl idl::InOrderPackratImpl for ServerImpl { fn write_host_bootfail( &mut self, _msg: &userlib::RecvMessage, + _reason: u8, _data: Leased, ) -> Result< HostInfoWriteOutput, From 1f75e171661676600e1f6845637a707a7bea7c8f Mon Sep 17 00:00:00 2001 From: James Munns Date: Tue, 19 May 2026 16:09:11 +0200 Subject: [PATCH 12/12] Plumb MGS requests to packrat IPC calls --- Cargo.lock | 2 +- Cargo.toml | 2 +- app/cosmo/base.toml | 2 +- idl/packrat.idol | 34 ++- .../src/mgs_compute_sled.rs | 98 +++++++- task/packrat-api/src/lib.rs | 19 ++ task/packrat/src/main.rs | 221 +++++++++++++----- 7 files changed, 311 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e1a79b16..b38eb13a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3131,7 +3131,7 @@ dependencies = [ [[package]] name = "gateway-messages" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/management-gateway-service#177c9c719e12896c566a1b6b5416c9bc686531d3" +source = "git+https://github.com/oxidecomputer/management-gateway-service?branch=james%2Fhost-fails#7b4d6b5a8f394d26ccf90a17911ab76e3807113c" dependencies = [ "bitflags 2.9.4", "hubpack", diff --git a/Cargo.toml b/Cargo.toml index 5c5d47e35..bfb760aa5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,7 +152,7 @@ apob = { git = "https://github.com/oxidecomputer/apob", default-features = false # for the migration. attest-data = { git = "https://github.com/oxidecomputer/dice-util", default-features = false, version = "0.4.0", rev = "a0811d06c75c757a6e12c91ed6ea81fde137ba43" } dice-mfg-msgs = { git = "https://github.com/oxidecomputer/dice-util", default-features = false, version = "0.2.1", rev = "a0811d06c75c757a6e12c91ed6ea81fde137ba43" } -gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false, features = ["smoltcp"] } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false, features = ["smoltcp"], branch = "james/host-fails" } gateway-ereport-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false } gimlet-inspector-protocol = { git = "https://github.com/oxidecomputer/gimlet-inspector-protocol", version = "0.1.0" } hif = { git = "https://github.com/oxidecomputer/hif", default-features = false } diff --git a/app/cosmo/base.toml b/app/cosmo/base.toml index 5dd1d635a..ffa827fcf 100644 --- a/app/cosmo/base.toml +++ b/app/cosmo/base.toml @@ -132,7 +132,7 @@ notifications = ["i2c1-irq", "i2c2-irq", "i2c3-irq", "i2c4-irq"] [tasks.packrat] name = "task-packrat" priority = 1 -stacksize = 1400 +stacksize = 1600 start = true task-slots = ["jefe"] features = ["cosmo", "ereport"] diff --git a/idl/packrat.idol b/idl/packrat.idol index f4abc10a9..a262c25a9 100644 --- a/idl/packrat.idol +++ b/idl/packrat.idol @@ -157,11 +157,23 @@ Interface( err: ServerDeath, ), ), + "read_first_host_bootfail_fragment": ( + doc: "Read a portion of the host's boot failure message", + args: { + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "HostBootfailReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), "read_host_bootfail_fragment": ( doc: "Read a portion of the host's boot failure message", args: { - "index": "u32", - "offset": "usize", + "request": "HostInfoRequest", }, leases: { "data": (type: "[u8]", write: true), @@ -184,17 +196,29 @@ Interface( err: ServerDeath, ), ), + "read_first_host_panic_fragment": ( + doc: "Read a portion of the host's panic message", + args: { + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "HostPanicReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), "read_host_panic_fragment": ( doc: "Read a portion of the host's panic message", args: { - "index": "u32", - "offset": "usize", + "request": "HostInfoRequest", }, leases: { "data": (type: "[u8]", write: true), }, reply: Result( - ok: "usize", + ok: "HostPanicReadOutput", err: CLike("HostInfoReadError"), ), idempotent: true, diff --git a/task/control-plane-agent/src/mgs_compute_sled.rs b/task/control-plane-agent/src/mgs_compute_sled.rs index 503402e01..1876f20bd 100644 --- a/task/control-plane-agent/src/mgs_compute_sled.rs +++ b/task/control-plane-agent/src/mgs_compute_sled.rs @@ -17,7 +17,8 @@ use gateway_messages::sp_impl::{ use gateway_messages::{ ApobComponentAction, ComponentAction, ComponentActionResponse, ComponentDetails, ComponentUpdatePrepare, DiscoverResponse, DumpSegment, - DumpTask, GpioToggleCount, Header, IgnitionCommand, IgnitionState, + DumpTask, GpioToggleCount, Header, HostBootfailPayloadData, + HostInfoRequest, HostPanicPayloadData, IgnitionCommand, IgnitionState, LastPostCode, Message, MessageKind, MgsError, MgsRequest, MgsResponse, PostCode, PowerState, PowerStateTransition, RotBootInfo, RotRequest, RotResponse, SERIAL_CONSOLE_IDLE_TIMEOUT, SensorRequest, SensorResponse, @@ -1259,6 +1260,101 @@ impl SpHandler for MgsHandler { })); self.host_flash_update.get_hash(slot) } + + fn get_host_panic_payload( + &mut self, + request: Option, + len: u32, + trailing_tx_buf: &mut [u8], + ) -> Result { + let max_len_usize = len as usize; + let max_len_usize = max_len_usize.min(trailing_tx_buf.len()); + let dest = &mut trailing_tx_buf[..max_len_usize]; + + let res = if let Some(req) = request { + self.common.packrat().read_host_panic_fragment( + task_packrat_api::HostInfoRequest { + offset: req.offset, + index: req.index, + }, + dest, + ) + } else { + self.common.packrat().read_first_host_panic_fragment(dest) + }; + + let info = res.map_err(|e| { + SpError::HostPanic(match e { + task_packrat_api::HostInfoReadError::NoHostInfo => { + gateway_messages::HostPanicError::NoHostInfo + } + task_packrat_api::HostInfoReadError::InvalidOffset => { + gateway_messages::HostPanicError::InvalidOffset + } + task_packrat_api::HostInfoReadError::InvalidIndex => { + gateway_messages::HostPanicError::InvalidIndex + } + task_packrat_api::HostInfoReadError::ServerRestarted => { + gateway_messages::HostPanicError::ServerRestarted + } + }) + })?; + + Ok(HostPanicPayloadData { + index: info.index, + len: info.read, + total_len: info.total_len as u32, + }) + } + + fn get_host_bootfail_payload( + &mut self, + request: Option, + len: u32, + trailing_tx_buf: &mut [u8], + ) -> Result { + let max_len_usize = len as usize; + let max_len_usize = max_len_usize.min(trailing_tx_buf.len()); + let dest = &mut trailing_tx_buf[..max_len_usize]; + + let res = if let Some(req) = request { + self.common.packrat().read_host_bootfail_fragment( + task_packrat_api::HostInfoRequest { + offset: req.offset, + index: req.index, + }, + dest, + ) + } else { + self.common + .packrat() + .read_first_host_bootfail_fragment(dest) + }; + + let info = res.map_err(|e| { + SpError::HostBootfail(match e { + task_packrat_api::HostInfoReadError::NoHostInfo => { + gateway_messages::HostBootfailError::NoHostInfo + } + task_packrat_api::HostInfoReadError::InvalidOffset => { + gateway_messages::HostBootfailError::InvalidOffset + } + task_packrat_api::HostInfoReadError::InvalidIndex => { + gateway_messages::HostBootfailError::InvalidIndex + } + task_packrat_api::HostInfoReadError::ServerRestarted => { + gateway_messages::HostBootfailError::ServerRestarted + } + }) + })?; + + Ok(HostBootfailPayloadData { + index: info.index, + len: info.read, + total_len: info.total_len as u32, + reason: info.reason, + }) + } } struct UsartHandler { diff --git a/task/packrat-api/src/lib.rs b/task/packrat-api/src/lib.rs index d63771cfb..fa859cda4 100644 --- a/task/packrat-api/src/lib.rs +++ b/task/packrat-api/src/lib.rs @@ -83,10 +83,29 @@ pub struct HostInfoWriteOutput { #[repr(C)] pub struct HostBootfailReadOutput { pub read: usize, + pub offset: usize, + pub total_len: usize, + pub index: u32, pub reason: u8, pub _pad: [u8; 3], } +#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, zerocopy::Immutable)] +#[repr(C)] +pub struct HostPanicReadOutput { + pub read: usize, + pub offset: usize, + pub total_len: usize, + pub index: u32, +} + +#[derive(Copy, Clone, Debug, FromBytes, IntoBytes, Immutable, KnownLayout)] +#[repr(C)] +pub struct HostInfoRequest { + pub offset: u32, + pub index: u32, +} + #[derive( Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, IdolError, counters::Count, )] diff --git a/task/packrat/src/main.rs b/task/packrat/src/main.rs index 67599a83d..62c3a38fd 100644 --- a/task/packrat/src/main.rs +++ b/task/packrat/src/main.rs @@ -72,8 +72,9 @@ use ringbuf::{ringbuf, ringbuf_entry}; use static_cell::ClaimOnceCell; use task_packrat_api::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostBootfailReadOutput, HostInfoReadError, HostInfoWriteOutput, - HostStartupOptions, MacAddressBlock, OxideIdentity, + HostBootfailReadOutput, HostInfoReadError, HostInfoRequest, + HostInfoWriteOutput, HostPanicReadOutput, HostStartupOptions, + MacAddressBlock, OxideIdentity, }; use userlib::RecvMessage; @@ -709,8 +710,7 @@ impl idl::InOrderPackratImpl for ServerImpl { fn read_host_bootfail_fragment( &mut self, _msg: &userlib::RecvMessage, - _index: u32, - _offset: usize, + _request: HostInfoRequest, _data: Leased, ) -> Result< HostBootfailReadOutput, @@ -719,50 +719,46 @@ impl idl::InOrderPackratImpl for ServerImpl { Err(HostInfoReadError::NoHostInfo.into()) } + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn read_first_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + Err(HostInfoReadError::NoHostInfo.into()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn read_first_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + self.host_bootfail_helper(None, data) + } + /// Attempt to obtain the requested host info. #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] fn read_host_bootfail_fragment( &mut self, _msg: &userlib::RecvMessage, - index: u32, - offset: usize, + request: HostInfoRequest, data: Leased, ) -> Result< HostBootfailReadOutput, idol_runtime::RequestError, > { - // Do we *have* a bootfail to report? - let Some(bfs) = self.host_info.host_bootfail_state.as_ref() else { - return Err(HostInfoReadError::NoHostInfo.into()); - }; - - // Do we have the specific bootfail data being requested? - if bfs.total_count != index { - return Err(HostInfoReadError::InvalidIndex.into()); - } - - // Is the offset requested valid? - let offset_max = bfs - .total_length - .min(self.host_info.host_bootfail_payload.len()); - if offset >= offset_max { - return Err(HostInfoReadError::InvalidOffset.into()); - } - - // Attempt to copy the requested range into the destination - let relevant = &self.host_info.host_bootfail_payload[offset_max..]; - let max_to_copy = data.len().min(relevant.len()); - data.write_range(0..max_to_copy, &relevant[..max_to_copy]) - .map_err(|_| ClientError::WentAway.fail())?; - - let out = HostBootfailReadOutput { - read: max_to_copy, - reason: bfs.reason, - _pad: [0u8; _], - }; - - // Okay! Written! Return how many bytes were actually copied - Ok(out) + self.host_bootfail_helper(Some(&request), data) } /// We're not a system that is expected to have a host, therefore we can always return @@ -835,10 +831,12 @@ impl idl::InOrderPackratImpl for ServerImpl { fn read_host_panic_fragment( &mut self, _msg: &userlib::RecvMessage, - _index: u32, - _offset: usize, + _request: HostInfoRequest, _data: Leased, - ) -> Result> { + ) -> Result< + HostPanicReadOutput, + idol_runtime::RequestError, + > { Err(HostInfoReadError::NoHostInfo.into()) } @@ -846,36 +844,142 @@ impl idl::InOrderPackratImpl for ServerImpl { fn read_host_panic_fragment( &mut self, _msg: &userlib::RecvMessage, - index: u32, - offset: usize, + request: HostInfoRequest, + data: Leased, + ) -> Result< + HostPanicReadOutput, + idol_runtime::RequestError, + > { + self.host_panic_helper(Some(&request), data) + } + + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn read_first_host_panic_fragment( + &mut self, + _msg: &userlib::RecvMessage, data: Leased, - ) -> Result> { + ) -> Result< + HostPanicReadOutput, + idol_runtime::RequestError, + > { + Err(HostInfoReadError::NoHostInfo.into()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn read_first_host_panic_fragment( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostPanicReadOutput, + idol_runtime::RequestError, + > { + self.host_panic_helper(None, data) + } +} + +impl ServerImpl { + fn host_panic_helper( + &self, + req: Option<&HostInfoRequest>, + data: Leased, + ) -> Result< + HostPanicReadOutput, + idol_runtime::RequestError, + > { // Do we *have* a panic to report? let Some(bfs) = self.host_info.host_panic_state.as_ref() else { return Err(HostInfoReadError::NoHostInfo.into()); }; - // Do we have the specific panic data being requested? - if bfs.total_count != index { - return Err(HostInfoReadError::InvalidIndex.into()); - } - - // Is the offset requested valid? - let offset_max = bfs + let length = bfs .total_length .min(self.host_info.host_panic_payload.len()); - if offset >= offset_max { - return Err(HostInfoReadError::InvalidOffset.into()); - } + let offset = if let Some(req) = req { + // Do we have the specific panic data being requested? + if bfs.total_count != req.index { + return Err(HostInfoReadError::InvalidIndex.into()); + } + + // Is the offset requested valid? + let offset_req = req.offset as usize; + if offset_req >= length { + return Err(HostInfoReadError::InvalidOffset.into()); + } + + offset_req + } else { + 0 + }; + + // Attempt to copy the requested range into the destination + let relevant = &self.host_info.host_panic_payload[offset..]; + let max_to_copy = data.len().min(relevant.len()); + data.write_range(0..max_to_copy, &relevant[..max_to_copy]) + .map_err(|_| ClientError::WentAway.fail())?; + + // Okay! Written! Return how many bytes were actually copied + Ok(HostPanicReadOutput { + read: max_to_copy, + offset: offset, + index: bfs.total_count, + total_len: length, + }) + } + + fn host_bootfail_helper( + &self, + request: Option<&HostInfoRequest>, + data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + // Do we *have* a bootfail to report? + let Some(bfs) = self.host_info.host_bootfail_state.as_ref() else { + return Err(HostInfoReadError::NoHostInfo.into()); + }; + + let length = bfs + .total_length + .min(self.host_info.host_bootfail_payload.len()); + let offset = if let Some(req) = request { + // Do we have the specific bootfail data being requested? + if bfs.total_count != req.index { + return Err(HostInfoReadError::InvalidIndex.into()); + } + + // Is the offset requested valid? + let offset_req = req.offset as usize; + if offset_req >= length { + return Err(HostInfoReadError::InvalidOffset.into()); + } + offset_req + } else { + 0 + }; // Attempt to copy the requested range into the destination - let relevant = &self.host_info.host_panic_payload[offset_max..]; + let relevant = &self.host_info.host_bootfail_payload[offset..]; let max_to_copy = data.len().min(relevant.len()); data.write_range(0..max_to_copy, &relevant[..max_to_copy]) .map_err(|_| ClientError::WentAway.fail())?; + let out = HostBootfailReadOutput { + read: max_to_copy, + reason: bfs.reason, + index: bfs.total_count, + total_len: length, + offset: offset, + _pad: [0u8; _], + }; + // Okay! Written! Return how many bytes were actually copied - Ok(max_to_copy) + Ok(out) } } @@ -912,8 +1016,9 @@ impl NotificationHandler for ServerImpl { mod idl { use super::{ CacheGetError, CacheSetError, EreportReadError, EreportWriteError, - HostBootfailReadOutput, HostInfoReadError, HostInfoWriteOutput, - HostStartupOptions, MacAddressBlock, OxideIdentity, ereport_messages, + HostBootfailReadOutput, HostInfoReadError, HostInfoRequest, + HostInfoWriteOutput, HostPanicReadOutput, HostStartupOptions, + MacAddressBlock, OxideIdentity, ereport_messages, }; include!(concat!(env!("OUT_DIR"), "/server_stub.rs"));