From d6b6e7b4b9ac8297b583817593253dfe245d6c92 Mon Sep 17 00:00:00 2001 From: Techcable Date: Sun, 21 Dec 2025 09:38:09 -0700 Subject: [PATCH 1/3] Use a single slowpath for Brc::new On may M1 Macbook, this shrinks the size of the code from 208B to 172B, but actually reduces performance on M1 Macbook by 2%. On my Lenovo Laptop, there is no effect on performance, but reduces code size from 174B to 163B. WARNING: This patch is incomplete, as it breaks the encapsulation of runtime::threads. --- src/runtime.rs | 24 +++++++----------------- src/runtime/threads.rs | 28 ++++++++++++++++++++++++++++ src/strong.rs | 19 +++++++++---------- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/runtime.rs b/src/runtime.rs index 7b5f3da..b71b5c7 100644 --- a/src/runtime.rs +++ b/src/runtime.rs @@ -1,6 +1,4 @@ -use crate::runtime::threads::{ - LocalThreadAccessError, LocalThreadState, SharedThreadInfo, ShortThreadId, -}; +use crate::runtime::threads::{LocalThreadAccessError, SharedThreadInfo, ShortThreadId}; use arbitrary_int::prelude::*; use cfg_if::cfg_if; use core::ffi::c_void; @@ -13,6 +11,9 @@ use core::sync::atomic::Ordering; mod threads; +// TODO: Eliminate this? +pub use threads::LocalThreadState; + /// An error returned by [`Brc::biased_count`], /// either caused by being the wrong thread or not being biased at all. /// @@ -78,24 +79,13 @@ impl RawBrcHeader { /// /// # Safety /// The resulting header must be pinned in-memory before it is ever used. + /// The thread id must be correct for this thread, or be `None`. /// /// # Panics /// This function will never unwind, although it may abort. #[inline] - pub unsafe fn init() -> Self { - let this_id = match LocalThreadState::existing_short_id() { - Ok(short_id) => Some(short_id), - Err(LocalThreadAccessError::Dead | LocalThreadAccessError::IdOverflow(_)) => { - // in this case, the local state was already initialized, - // but we cannot participate in biased reference counting - None - } - Err(LocalThreadAccessError::Uninitialized) => { - // Need to actually initialize the thread state - LocalThreadState::init_tid() - } - }; - match this_id { + pub unsafe fn init_with(this_thread_id: Option) -> Self { + match this_thread_id { None => RawBrcHeader { shared_word: AtomicUsize::new( SharedWord { diff --git a/src/runtime/threads.rs b/src/runtime/threads.rs index a6c7fa1..f38241c 100644 --- a/src/runtime/threads.rs +++ b/src/runtime/threads.rs @@ -256,6 +256,34 @@ impl LocalThreadState { nounwind::abort_unwind(|| LocalThreadState::with_current(LocalThreadState::short_id).ok()) } + /// Either call [`collect`] or initialize the thread state, + /// depending on the previous thread state. + /// + /// This function is a performance experiment. + #[inline] + pub fn collect_or_init_tid() -> Option { + let (current_status, thread_id) = + THIS_THREAD_STATE_FAST.with(|state| (state.status.get(), state.short_id.get())); + match current_status { + LocalThreadStatus::DeadOrDying => None, + LocalThreadStatus::Uninit => Self::collect_or_init_tid_slow(), + LocalThreadStatus::Active if Self::currently_needs_collect() => { + Self::collect_or_init_tid_slow() + } + LocalThreadStatus::Active => thread_id, + } + } + #[inline(never)] + #[cold] + fn collect_or_init_tid_slow() -> Option { + if Self::currently_needs_collect() { + crate::collect(); + THIS_THREAD_STATE_FAST.with(|state| state.short_id.get()) + } else { + Self::init_tid() + } + } + /// Access the current thread info inside the specified closure. /// /// # Safety diff --git a/src/strong.rs b/src/strong.rs index 1417ce0..2777f0f 100644 --- a/src/strong.rs +++ b/src/strong.rs @@ -95,11 +95,7 @@ impl Brc { /// using a particular allocator. #[inline] pub fn new_in(value: T, alloc: A) -> Self { - // There is no advantage to waiting for BrcRawHeader::init to initialize the thread state. - // This is because if the thread state is uninitialized, - // the queue is empty and collection would be a no-op anyway. - #[cfg(not(biasedrc_no_implicit_collect))] - collect(); + let thread_id = runtime::LocalThreadState::collect_or_init_tid(); // This function used to be implemented as Self::new_with(|| value). // While correct, this caused code bloat and was noticeably slower than Arc::new. // @@ -125,7 +121,8 @@ impl Brc { // If we are wrong, we just leak the newly allocated memory let header = BrcHeader { // SAFETY: Pinned in memory immediately after construction. - strong: unsafe { RawBrcHeader::init() }, + // Thread id is correct for this thread. + strong: unsafe { RawBrcHeader::init_with(thread_id) }, weak_count: AtomicU32::new(1), alloc: ManuallyDrop::new(alloc), }; @@ -299,8 +296,7 @@ impl Brc { func: impl FnOnce(*mut T), alloc: A, ) -> Self { - #[cfg(not(biasedrc_no_implicit_collect))] - collect(); + let this_thread_id = runtime::LocalThreadState::collect_or_init_tid(); let layout = LayoutInfo::::new_or_panic(layout); struct CleanupGuard { ptr: NonNull, @@ -330,9 +326,12 @@ impl Brc { } // SAFETY: Memory is newly allocated so it is known to be valid // The RawBrcHeader is pinned immediately after it is created - // we just verified above that the field offset is zero + // and we just verified above that the field offset is zero. + // We also know that `this_thread_id` is valid unsafe { - allocated.cast::().write(RawBrcHeader::init()); + allocated + .cast::() + .write(RawBrcHeader::init_with(this_thread_id)); } // SAFETY: Newly allocated memory is valid unsafe { From 3f030788ea3588e44dcbd450bef55fb1748a1936 Mon Sep 17 00:00:00 2001 From: Techcable Date: Sun, 21 Dec 2025 09:50:45 -0700 Subject: [PATCH 2/3] Consolidate slow-paths for Brc::new Doesn't seem to help code size or performance. The old patch at least helped with the former. --- src/runtime/threads.rs | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/runtime/threads.rs b/src/runtime/threads.rs index f38241c..fd6ea77 100644 --- a/src/runtime/threads.rs +++ b/src/runtime/threads.rs @@ -266,21 +266,13 @@ impl LocalThreadState { THIS_THREAD_STATE_FAST.with(|state| (state.status.get(), state.short_id.get())); match current_status { LocalThreadStatus::DeadOrDying => None, - LocalThreadStatus::Uninit => Self::collect_or_init_tid_slow(), - LocalThreadStatus::Active if Self::currently_needs_collect() => { - Self::collect_or_init_tid_slow() + LocalThreadStatus::Uninit => Self::init_tid(), + LocalThreadStatus::Active => { + if Self::currently_needs_collect() { + Self::collect_slow(); + } + thread_id } - LocalThreadStatus::Active => thread_id, - } - } - #[inline(never)] - #[cold] - fn collect_or_init_tid_slow() -> Option { - if Self::currently_needs_collect() { - crate::collect(); - THIS_THREAD_STATE_FAST.with(|state| state.short_id.get()) - } else { - Self::init_tid() } } From eab44b9784ba4a26decb7f33cc06d6b846e214fd Mon Sep 17 00:00:00 2001 From: Techcable Date: Sun, 21 Dec 2025 09:51:15 -0700 Subject: [PATCH 3/3] Revert "Consolidate slow-paths for Brc::new" This reverts commit 3f030788ea3588e44dcbd450bef55fb1748a1936. Did not improve code size as expected, still seemed to harm performance. --- src/runtime/threads.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/runtime/threads.rs b/src/runtime/threads.rs index fd6ea77..f38241c 100644 --- a/src/runtime/threads.rs +++ b/src/runtime/threads.rs @@ -266,13 +266,21 @@ impl LocalThreadState { THIS_THREAD_STATE_FAST.with(|state| (state.status.get(), state.short_id.get())); match current_status { LocalThreadStatus::DeadOrDying => None, - LocalThreadStatus::Uninit => Self::init_tid(), - LocalThreadStatus::Active => { - if Self::currently_needs_collect() { - Self::collect_slow(); - } - thread_id + LocalThreadStatus::Uninit => Self::collect_or_init_tid_slow(), + LocalThreadStatus::Active if Self::currently_needs_collect() => { + Self::collect_or_init_tid_slow() } + LocalThreadStatus::Active => thread_id, + } + } + #[inline(never)] + #[cold] + fn collect_or_init_tid_slow() -> Option { + if Self::currently_needs_collect() { + crate::collect(); + THIS_THREAD_STATE_FAST.with(|state| state.short_id.get()) + } else { + Self::init_tid() } }