From a68c7e1b5bec35479e98472e1f0964425de41677 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 22 May 2026 13:54:40 -0700 Subject: [PATCH 1/3] logging: do not panic in the event of a failure to log a message This is a quite sad situation. Currently, the logger configuration produced by `dropshot::ConfigLogging` will panic whenever an error writing a log line occurs. This crashes the entire process, which is often very much not what you want to have happen. Therefore, this branch changes the behavior to ignore errors returned here, resulting in the log line being silently eaten. This is *also* a behavior that I will be the first to admit is quite sad, but I believe it to be a less sad default than crashing the entire process. As an example of why, consider the Oxide rack's sled-agent. The sled-agent will currently crash when it attempts to log something and the root filesystem to which it attempts to log is full, as described in oxidecomputer/omicron#4354. This is quite bad, as if we were to implement some way of freeing up additional storage space in such a situation, it is likely the sled-agent that would be responsible for doing so! In an ideal world, we might wish for different behavior based on the particular error that occurred. We may wish to panic on errors to serialize a log line (as that would be indicative of a programmer error), while we may wish to not panic on I/O errors. Sadly, slog's current interfaces make this challenging, as all errors from the loggers we use, are presently coerced to `std::io::Error`, making it difficult to distinguish between I/O and serialization errors in a non-flaky way. In the future, we might want to change the behavior from *silently* dropping log lines to instead recording when we have done so, perhaps by maintaining a count of logging errors. This could be broken down by error kinds, and track the timestamp of the last time a log message was lost. Reporting could report these error counts through other means, such as timeseries metrics, would allow the operator of the service to at least *know* that their log is incomplete. We could even imagine having a thingy that tracks if we have lost log messages, and once a log message is written successfully, tries to log a "hey, by the way, we also dropped however many log lines over the last however long!". We might also want to make the panciking behavior configurable. This is sadly somewhat more annoying than it ought to be, as `slog::Drain::fuse()` and `slog::Drain::ignore_res()` change the _type_ of the drain, requiring duplicate code paths that construct almost-but-not-entirely identical loggers. But, this would let us have a config which, say, panics in tests but does not panic in production, which seems like a reasonable thing to want. Nonetheless, this commit takes the shortest path to not panicking, and just turns all the `fuse()`s into `ignore_res()`es. I think the present behavior is bad enough in production that the quick fix feels fairly justified. We should consider making this nicer in the future though. --- dropshot/src/logging.rs | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/dropshot/src/logging.rs b/dropshot/src/logging.rs index 110fdad67..c75bb9892 100644 --- a/dropshot/src/logging.rs +++ b/dropshot/src/logging.rs @@ -76,7 +76,7 @@ impl ConfigLogging { ConfigLogging::StderrTerminal { level } => { let decorator = slog_term::TermDecorator::new().build(); let drain = - slog_term::FullFormat::new(decorator).build().fuse(); + slog_term::FullFormat::new(decorator).build().ignore_res(); Ok(async_root_logger(level, drain)) } @@ -149,9 +149,27 @@ where T: slog::Drain + Send + 'static, ::Err: std::fmt::Debug, { - let level_drain = slog::LevelFilter(drain, Level::from(level)).fuse(); + let level_drain = slog::LevelFilter(drain, Level::from(level)) + .ignore_res(); let async_drain = - slog_async::Async::new(level_drain).chan_size(1024).build().fuse(); + slog_async::Async::new(level_drain).chan_size(1024).build() + // Use `ignore_res()` here rather than `fuse`, to ensure that we do + // not panic in the event of a potentially-transient I/O error, such + // as a full disk. Although it is quite sad to silently eat a log + // message, it is much sadder to crash the entire process, + // particularly when the process is what is ultimately responsible + // for remediating the circumstances that causes logging to fail. + // + // In an ideal world, we might like some way of tracking a count of + // how many log messages have been dropped, and for what reason. + // + // We may also wish to configure slog to panic if it attempts to log + // something that it cannot serialze, but not to panic if it + // encounters an I/O error. Sadly, slog's interfaces for error + // handling make this difficult, as everything is getting coerced to + // `std::io::Error` anyway, even if it is actually a serialization + // error. Sigh. + .ignore_res(); slog::Logger::root(async_drain, o!()) } @@ -159,7 +177,7 @@ fn log_drain_for_file( open_options: &OpenOptions, path: &Path, log_name: String, -) -> Result>>, io::Error> { +) -> Result>>, io::Error> { if let Some(parent) = path.parent() { std::fs::create_dir_all(parent)?; } @@ -174,7 +192,10 @@ fn log_drain_for_file( // TODO-cleanup let log_name_box = Box::new(log_name); let log_name_leaked = Box::leak(log_name_box); - Ok(slog_bunyan::with_name(log_name_leaked, file).build().fuse()) + Ok(slog_bunyan::with_name(log_name_leaked, file).build() + // As I discussed above, we wish to avoid panicking in the event of an + // error whilst writing a log message to the file. + .ignore_res()) } #[cfg(test)] From a64ff7d78a8c140a2dbe7303b9a4f6ca33732a67 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 22 May 2026 14:00:31 -0700 Subject: [PATCH 2/3] I DIDN'T MEAN TO CLICK THE GODDAMN COPILOT THING BUT IT FOUDN SOME TYPOS Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- dropshot/src/logging.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dropshot/src/logging.rs b/dropshot/src/logging.rs index c75bb9892..9e476def7 100644 --- a/dropshot/src/logging.rs +++ b/dropshot/src/logging.rs @@ -158,13 +158,13 @@ where // as a full disk. Although it is quite sad to silently eat a log // message, it is much sadder to crash the entire process, // particularly when the process is what is ultimately responsible - // for remediating the circumstances that causes logging to fail. + // for remediating the circumstances that cause logging to fail. // // In an ideal world, we might like some way of tracking a count of // how many log messages have been dropped, and for what reason. // // We may also wish to configure slog to panic if it attempts to log - // something that it cannot serialze, but not to panic if it + // something that it cannot serialize, but not to panic if it // encounters an I/O error. Sadly, slog's interfaces for error // handling make this difficult, as everything is getting coerced to // `std::io::Error` anyway, even if it is actually a serialization From ee5793b9fe5792825199b6cf3bbee6f850abb0b2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 22 May 2026 14:01:36 -0700 Subject: [PATCH 3/3] rustfmt --- dropshot/src/logging.rs | 48 ++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/dropshot/src/logging.rs b/dropshot/src/logging.rs index 9e476def7..836d3b2c9 100644 --- a/dropshot/src/logging.rs +++ b/dropshot/src/logging.rs @@ -149,27 +149,27 @@ where T: slog::Drain + Send + 'static, ::Err: std::fmt::Debug, { - let level_drain = slog::LevelFilter(drain, Level::from(level)) + let level_drain = slog::LevelFilter(drain, Level::from(level)).ignore_res(); + let async_drain = slog_async::Async::new(level_drain) + .chan_size(1024) + .build() + // Use `ignore_res()` here rather than `fuse`, to ensure that we do + // not panic in the event of a potentially-transient I/O error, such + // as a full disk. Although it is quite sad to silently eat a log + // message, it is much sadder to crash the entire process, + // particularly when the process is what is ultimately responsible + // for remediating the circumstances that cause logging to fail. + // + // In an ideal world, we might like some way of tracking a count of + // how many log messages have been dropped, and for what reason. + // + // We may also wish to configure slog to panic if it attempts to log + // something that it cannot serialize, but not to panic if it + // encounters an I/O error. Sadly, slog's interfaces for error + // handling make this difficult, as everything is getting coerced to + // `std::io::Error` anyway, even if it is actually a serialization + // error. Sigh. .ignore_res(); - let async_drain = - slog_async::Async::new(level_drain).chan_size(1024).build() - // Use `ignore_res()` here rather than `fuse`, to ensure that we do - // not panic in the event of a potentially-transient I/O error, such - // as a full disk. Although it is quite sad to silently eat a log - // message, it is much sadder to crash the entire process, - // particularly when the process is what is ultimately responsible - // for remediating the circumstances that cause logging to fail. - // - // In an ideal world, we might like some way of tracking a count of - // how many log messages have been dropped, and for what reason. - // - // We may also wish to configure slog to panic if it attempts to log - // something that it cannot serialize, but not to panic if it - // encounters an I/O error. Sadly, slog's interfaces for error - // handling make this difficult, as everything is getting coerced to - // `std::io::Error` anyway, even if it is actually a serialization - // error. Sigh. - .ignore_res(); slog::Logger::root(async_drain, o!()) } @@ -177,7 +177,10 @@ fn log_drain_for_file( open_options: &OpenOptions, path: &Path, log_name: String, -) -> Result>>, io::Error> { +) -> Result< + slog::IgnoreResult>>, + io::Error, +> { if let Some(parent) = path.parent() { std::fs::create_dir_all(parent)?; } @@ -192,7 +195,8 @@ fn log_drain_for_file( // TODO-cleanup let log_name_box = Box::new(log_name); let log_name_leaked = Box::leak(log_name_box); - Ok(slog_bunyan::with_name(log_name_leaked, file).build() + Ok(slog_bunyan::with_name(log_name_leaked, file) + .build() // As I discussed above, we wish to avoid panicking in the event of an // error whilst writing a log message to the file. .ignore_res())