diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h
index bacd58fb0e..3b745fdd28 100644
--- a/src/TiledArray/array_impl.h
+++ b/src/TiledArray/array_impl.h
@@ -27,11 +27,14 @@
 #define TILEDARRAY_ARRAY_IMPL_H__INCLUDED
 
 #include <TiledArray/distributed_storage.h>
+#include <TiledArray/tensor/arena_kernels.h>
 #include <TiledArray/tensor_impl.h>
 #include <TiledArray/transform_iterator.h>
 #include <TiledArray/type_traits.h>
 #include <TiledArray/util/function.h>
 
+#include <map>
+
 namespace TiledArray {
 namespace detail {
 
@@ -986,48 +989,104 @@ std::shared_ptr<ArrayImpl<Tile, Policy>> make_with_new_trange(
           Policy::default_pmap(world, target_trange.tiles_range().volume())),
       Array::lazy_deleter);
   auto& target_array = *target_array_sptr;
-  target_array.init_tiles([value = new_value_fill](const Range& range) {
-    return typename Array::value_type(range, value);
-  });
-  target_array.world().gop.fence();
-
-  // loop over local tile and sends its contributions to the targets
-  {
-    const auto e = source_array.cend();
-    auto& target_tiles_range = target_trange.tiles_range();
-    for (auto it = source_array.cbegin(); it != e; ++it) {
-      const auto& source_tile = *it;
-      auto source_tile_idx = it.index();
 
-      // make range for iterating over all possible target tile idx combinations
-      TA::Index target_tile_ord_extent_range(rank);
-      for (auto d = 0; d != rank; ++d) {
-        target_tile_ord_extent_range[d] =
-            all_target_tiles[d][source_tile_idx[d]].size();
+  if constexpr (detail::is_tensor_of_tensor_v<Tile> &&
+                is_arena_tensor_v<typename Tile::value_type>) {
+    // Arena tensor-of-tensor: a ToT tile's inner cells are non-owning views
+    // into that tile's own arena slab. The generic null-init + write_tile_block
+    // scatter (the `else` branch) would rebind the target's null inner cells to
+    // the *source* tiles' slabs, leaving them dangling once the source array is
+    // destroyed. Instead build each local target tile directly (deep copy) by
+    // pulling the source cells: a retile preserves the element space, so the
+    // target cell at global outer element `e` takes its inner range and data
+    // from the source cell at `e` (elements outside the source range, e.g. a
+    // retile that grows the element range, yield null cells).
+    using inner_range_type = typename Tile::value_type::range_type;
+    const auto& source_elements = source_array.trange().elements_range();
+    std::map<std::size_t, Tile> src_tile_cache;
+    auto source_cell_at =
+        [&](const auto& e) -> const typename Tile::value_type* {
+      if (!source_elements.includes(e)) return nullptr;
+      const auto src_tile_idx = source_array.trange().element_to_tile(e);
+      const auto src_ord =
+          source_array.trange().tiles_range().ordinal(src_tile_idx);
+      auto it = src_tile_cache.find(src_ord);
+      if (it == src_tile_cache.end()) {
+        it = src_tile_cache
+                 .emplace(src_ord, source_array.is_zero(src_tile_idx)
+                                       ? Tile{}
+                                       : source_array.get(src_tile_idx).get())
+                 .first;
       }
-
-      // loop over every target tile combination
-      TA::Range target_tile_ord_extent(target_tile_ord_extent_range);
-      for (auto& target_tile_ord : target_tile_ord_extent) {
-        TA::Index target_tile_idx(rank);
-        container::svector<TA::Range1> target_tile_rngs1(rank);
+      const Tile& st = it->second;
+      if (st.empty()) return nullptr;
+      return &st(e);
+    };
+    for (const auto target_ord : *target_array.pmap()) {
+      if (target_array.is_zero(target_ord)) continue;
+      Tile tile = make_nested_tile<Tile>(
+          target_trange.make_tile_range(target_ord),
+          [&](const auto& e) -> inner_range_type {
+            const auto* sc = source_cell_at(e);
+            return (sc && !sc->empty()) ? sc->range() : inner_range_type{};
+          },
+          [&](auto& cell, const auto& e) {
+            const auto* sc = source_cell_at(e);
+            if (sc && !sc->empty()) {
+              const auto* s = sc->data();
+              auto* d = cell.data();
+              for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p];
+            }
+          });
+      target_array.set(target_ord, std::move(tile));
+    }
+    target_array.world().gop.fence();
+  } else {
+    target_array.init_tiles([value = new_value_fill](const Range& range) {
+      return typename Array::value_type(range, value);
+    });
+    target_array.world().gop.fence();
+
+    // loop over local tile and sends its contributions to the targets
+    {
+      const auto e = source_array.cend();
+      auto& target_tiles_range = target_trange.tiles_range();
+      for (auto it = source_array.cbegin(); it != e; ++it) {
+        const auto& source_tile = *it;
+        auto source_tile_idx = it.index();
+
+        // make range for iterating over all possible target tile idx
+        // combinations
+        TA::Index target_tile_ord_extent_range(rank);
         for (auto d = 0; d != rank; ++d) {
-          std::tie(target_tile_idx[d], target_tile_rngs1[d]) =
-              all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]];
+          target_tile_ord_extent_range[d] =
+              all_target_tiles[d][source_tile_idx[d]].size();
+        }
+
+        // loop over every target tile combination
+        TA::Range target_tile_ord_extent(target_tile_ord_extent_range);
+        for (auto& target_tile_ord : target_tile_ord_extent) {
+          TA::Index target_tile_idx(rank);
+          container::svector<TA::Range1> target_tile_rngs1(rank);
+          for (auto d = 0; d != rank; ++d) {
+            std::tie(target_tile_idx[d], target_tile_rngs1[d]) =
+                all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]];
+          }
+          TA_ASSERT(source_tile.future().probe());
+          Tile target_tile_contribution(
+              source_tile.get().block(target_tile_rngs1));
+          auto target_tile_idx_ord =
+              target_tiles_range.ordinal(target_tile_idx);
+          auto target_proc = target_array.pmap()->owner(target_tile_idx_ord);
+          world.taskq.add(target_proc, &write_tile_block<Tile, Policy>,
+                          target_array.id(), target_tile_idx_ord,
+                          target_tile_contribution);
         }
-        TA_ASSERT(source_tile.future().probe());
-        Tile target_tile_contribution(
-            source_tile.get().block(target_tile_rngs1));
-        auto target_tile_idx_ord = target_tiles_range.ordinal(target_tile_idx);
-        auto target_proc = target_array.pmap()->owner(target_tile_idx_ord);
-        world.taskq.add(target_proc, &write_tile_block<Tile, Policy>,
-                        target_array.id(), target_tile_idx_ord,
-                        target_tile_contribution);
       }
     }
+    // data is mutated in place, so must wait for all tasks to complete
+    target_array.world().gop.fence();
   }
-  // data is mutated in place, so must wait for all tasks to complete
-  target_array.world().gop.fence();
   // WARNING!! need to truncate in DistArray ctor
 
   return target_array_sptr;
diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h
index c3cafb5605..6b994e17c0 100644
--- a/src/TiledArray/dist_array.h
+++ b/src/TiledArray/dist_array.h
@@ -28,6 +28,7 @@
 #include "TiledArray/pmap/replicated_pmap.h"
 #include "TiledArray/policies/dense_policy.h"
 #include "TiledArray/replicator.h"
+#include "TiledArray/tensor/arena_kernels.h"
 #include "TiledArray/tile_interface/cast.h"
 #include "TiledArray/util/annotation.h"
 #include "TiledArray/util/initializer_list.h"
@@ -253,6 +254,35 @@ class DistArray : public madness::archive::ParallelSerializableObject {
                 std::shared_ptr<const pmap_interface>())
       : pimpl_(init(get_default_world(), trange, shape, pmap)) {}
 
+  /// Tensor-of-tensors array constructor
+
+  /// Constructs a tensor-of-tensors array in fully-shaped state: every inner
+  /// cell of every local tile is allocated (its range taken from
+  /// \p inner_range_fn, element storage zero-initialized), so the array
+  /// immediately satisfies the ToT validity invariant and is ready for
+  /// in-place fill (\c fill, \c foreach_inplace, element writes, ...).
+  /// Enabled only when \c Tile is a tensor-of-tensors.
+  /// \tparam InnerRangeFn callable type
+  /// \param world The world where the array will live.
+  /// \param trange The tiled range object that defines the array tiling.
+  /// \param inner_range_fn callable mapping a global outer element index to
+  ///        that inner cell's range; a zero-volume range yields a null cell.
+  /// \param pmap The tile index -> process map
+  template <
+      typename InnerRangeFn,
+      typename = std::enable_if_t<
+          detail::is_tensor_of_tensor_v<Tile> &&
+          !std::is_convertible_v<std::decay_t<InnerRangeFn>, shape_type> &&
+          !std::is_convertible_v<std::decay_t<InnerRangeFn>,
+                                 std::shared_ptr<const pmap_interface>>>>
+  DistArray(World& world, const trange_type& trange,
+            InnerRangeFn&& inner_range_fn,
+            const std::shared_ptr<const pmap_interface>& pmap = {})
+      : DistArray(world, trange, pmap) {
+    init_tiles_nested(std::forward<InnerRangeFn>(inner_range_fn),
+                      detail::nested_fill_noop{});
+  }
+
   /// \name Initializer list constructors
   /// \brief Creates a new tensor containing the elements in the provided
   ///         `std::initializer_list`.
@@ -779,7 +809,23 @@ class DistArray : public madness::archive::ParallelSerializableObject {
                                   detail::is_input_iterator<InIter>::value>>
   void set(const Index& i, InIter first) {
     check_index(i);
-    pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first));
+    if constexpr (detail::is_tensor_of_tensor_v<value_type> &&
+                  is_arena_tensor_v<element_type>) {
+      // arena ToT: the iterated inner tiles carry the ranges needed to size
+      // the slab; buffer them (the iterator is single-pass) and build.
+      const auto outer_range = pimpl_->trange().make_tile_range(i);
+      using SrcTile = std::decay_t<decltype(*first)>;
+      std::vector<SrcTile> buf;
+      buf.reserve(outer_range.volume());
+      for (std::size_t k = 0; k < outer_range.volume(); ++k, ++first)
+        buf.emplace_back(*first);
+      pimpl_->set(i, make_arena_nested_tile(
+                         outer_range, [&buf](std::size_t k) -> const SrcTile& {
+                           return buf[k];
+                         }));
+    } else {
+      pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first));
+    }
   }
 
   /// Set a tile and fill it using a sequence
@@ -828,7 +874,20 @@ class DistArray : public madness::archive::ParallelSerializableObject {
             typename = enable_if_is_integral_or_integral_range<Index>>
   void set(const Index& i, const element_type& value = element_type()) {
     check_index(i);
-    pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), value));
+    if constexpr (detail::is_tensor_of_tensor_v<value_type> &&
+                  is_arena_tensor_v<element_type>) {
+      // arena ToT: every inner cell takes `value`'s (initialized) range and a
+      // deep copy of its data -- build the slab-backed tile from that range.
+      TA_ASSERT(!value.empty() &&
+                "DistArray::set: a null inner tile has no range to size cells");
+      pimpl_->set(i, make_arena_nested_tile(
+                         pimpl_->trange().make_tile_range(i),
+                         [&value](std::size_t) -> const element_type& {
+                           return value;
+                         }));
+    } else {
+      pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), value));
+    }
   }
 
   /// Set every element of a tile to a specified value
@@ -908,12 +967,33 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   ///                              guarantee.
   /// \throw TiledArray::Exception if skip_set is false and a local tile is
   ///                              already set. Weak throw guarantee.
-  template <Fence fence = Fence::No>
-  std::int64_t fill_local(const element_type& value = element_type(),
-                          bool skip_set = false) {
-    return init_tiles<HostExecutor::Default, fence>(
-        [value](const range_type& range) { return value_type(range, value); },
-        skip_set);
+  ///
+  /// \tparam V the value type; defaults to \c element_type but may be any
+  ///         type assignable to \c element_type& -- a freestanding
+  ///         \c ArenaTensor cannot be minted, so an arena-backed ToT is
+  ///         filled by passing e.g. an owning \c TA::Tensor.
+  /// \note For an *arena-backed* tensor-of-tensors tile type this is an
+  /// in-place mutator over an already-shaped array (constructed via
+  /// \c init_tiles_nested or the ToT range_fn ctor): \p value is deep-copied
+  /// into every (bound) inner cell, so it must match each cell's volume.
+  template <Fence fence = Fence::No, typename V = element_type,
+            typename =
+                std::enable_if_t<std::is_assignable_v<element_type&, const V&>>>
+  std::int64_t fill_local(const V& value = V(), bool skip_set = false) {
+    if constexpr (detail::is_tensor_of_tensor_v<value_type> &&
+                  is_arena_tensor_v<element_type>) {
+      return for_each_local_tile_inplace<fence>([value](value_type& outer) {
+        for (std::size_t o = 0; o < outer.size(); ++o) {
+          auto& cell = outer.data()[o];
+          if (cell.empty()) continue;  // skip deliberately-null cells
+          cell = value;                // deep copy into the bound arena cell
+        }
+      });
+    } else {
+      return init_tiles<HostExecutor::Default, fence>(
+          [value](const range_type& range) { return value_type(range, value); },
+          skip_set);
+    }
   }
 
   /// Fill all local tiles with the specified value
@@ -930,11 +1010,16 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   ///                              guarantee.
   /// \throw TiledArray::Exception if skip_set is false and a local tile is
   ///                              already set. Weak throw guarantee.
-  template <Fence fence = Fence::No>
-  std::int64_t fill(const element_type& value = numeric_type(),
-                    bool skip_set = false) {
-    // for sparse arrays filled with zero, replace with an empty array
-    if constexpr (!is_dense_v<Policy>) {
+  template <Fence fence = Fence::No, typename V = element_type,
+            typename =
+                std::enable_if_t<std::is_assignable_v<element_type&, const V&>>>
+  std::int64_t fill(const V& value = V(), bool skip_set = false) {
+    // for sparse arrays filled with zero, replace with an empty array;
+    // an arena-backed ToT is shaped before fill (fill_local mutates in place),
+    // and its inner view tiles have no zero-comparison -- skip for those
+    if constexpr (!is_dense_v<Policy> &&
+                  !(detail::is_tensor_of_tensor_v<value_type> &&
+                    is_arena_tensor_v<element_type>)) {
       if (value == element_type()) {
         *this = DistArray(
             world(), trange(),
@@ -957,7 +1042,9 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   /// \tparam fence If Fence::No, the operation will return early,
   ///         before the tasks have completed
   /// \tparam T The type of random value to generate. Defaults to
-  ///           element_type.
+  ///           numeric_type (the scalar type), so this works for
+  ///           tensor-of-tensors arrays, where it fills every inner scalar
+  ///           in place over an already-shaped array (see \c init_elements).
   /// \param[in] skip_set If false, will throw if any tiles are already set
   /// \return the total number of tiles that have been (or will be) initialized
   /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong
@@ -965,11 +1052,25 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   /// \throw TiledArray::Exception if skip_set is false and a local tile is
   ///                              already initialized. Weak throw guarantee.
   template <HostExecutor Exec = HostExecutor::Default,
-            typename T = element_type, Fence fence = Fence::No,
+            typename T = numeric_type, Fence fence = Fence::No,
             typename = detail::enable_if_can_make_random_t<T>>
   std::int64_t fill_random(bool skip_set = false) {
-    return init_elements<Exec, fence>(
-        [](const auto&) { return detail::MakeRandom<T>::generate_value(); });
+    if constexpr (detail::is_tensor_of_tensor_v<value_type>) {
+      // in-place over an already-shaped ToT array (plain or arena-backed):
+      // overwrite every inner scalar, leaving inner ranges untouched
+      return for_each_local_tile_inplace<fence>([](value_type& outer) {
+        for (std::size_t o = 0; o < outer.size(); ++o) {
+          auto& cell = outer.data()[o];
+          if (cell.empty()) continue;
+          const std::size_t n = cell.size();
+          for (std::size_t i = 0; i < n; ++i)
+            cell.data()[i] = detail::MakeRandom<T>::generate_value();
+        }
+      });
+    } else {
+      return init_elements<Exec, fence>(
+          [](const auto&) { return detail::MakeRandom<T>::generate_value(); });
+    }
   }
 
   /// Initialize (local) tiles with a user provided functor
@@ -1042,20 +1143,94 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   /// \throw TiledArray::Exception if skip_set is false and a local, non-zero
   ///                              tile is already initialized. Weak throw
   ///                              guarantee.
+  ///
+  /// \note \p op must return a freestanding value assignable to
+  /// \c element_type&. For an *arena-backed* tensor-of-tensors tile type the
+  /// inner cell is a non-owning view that cannot be minted standalone, so
+  /// \p op returns an owning tensor (e.g. \c TA::Tensor): each outer tile
+  /// collects its \p op outputs, then allocates one arena slab sized to them
+  /// (via \c detail::make_nested_tile) and deep-copies the outputs into the
+  /// bound inner cells.
   template <HostExecutor Exec = HostExecutor::Default, Fence fence = Fence::No,
             typename Op>
   std::int64_t init_elements(Op&& op, bool skip_set = false) {
     auto op_shared_handle = make_op_shared_handle(std::forward<Op>(op));
-    return init_tiles<Exec, fence>(
-        [op = std::move(op_shared_handle)](
-            const TiledArray::Range& range) -> value_type {
-          // Initialize the tile with the given range object
-          Tile tile(range);
+    if constexpr (detail::is_tensor_of_tensor_v<value_type> &&
+                  is_arena_tensor_v<element_type>) {
+      return init_tiles<Exec, fence>(
+          [op = std::move(op_shared_handle)](
+              const TiledArray::Range& outer_range) -> value_type {
+            using R = std::decay_t<decltype(op(outer_range.idx(0)))>;
+            static_assert(
+                std::is_assignable_v<element_type&, const R&>,
+                "DistArray::init_elements: op must return a freestanding "
+                "tensor assignable to the inner tile type");
+            // pass 1: collect op's freestanding inner tensors; pass 2:
+            // make_arena_nested_tile sizes the slab and deep-copies them in
+            std::vector<R> collected;
+            collected.reserve(outer_range.volume());
+            for (std::size_t o = 0; o < outer_range.volume(); ++o)
+              collected.emplace_back(op(outer_range.idx(o)));
+            return make_arena_nested_tile(
+                outer_range, [&collected](std::size_t k) -> const R& {
+                  return collected[k];
+                });
+          },
+          skip_set);
+    } else {
+      return init_tiles<Exec, fence>(
+          [op = std::move(op_shared_handle)](
+              const TiledArray::Range& range) -> value_type {
+            // Initialize the tile with the given range object
+            Tile tile(range);
+
+            // Initialize tile elements
+            for (auto& idx : range) tile[idx] = op(idx);
+
+            return tile;
+          },
+          skip_set);
+    }
+  }
 
-          // Initialize tile elements
-          for (auto& idx : range) tile[idx] = op(idx);
+  /// Initialize tensor-of-tensors tiles two-pass with user-provided functors
 
-          return tile;
+  /// A whole-tile constructor (like \c init_tiles), specialized for
+  /// tensor-of-tensors \c Tile s: each local tile is built via
+  /// \c detail::make_nested_tile -- \p inner_range_fn sizes every inner cell,
+  /// \p inner_fill_fn fills it -- so arena-backed inner cells are allocated in
+  /// one slab per tile. The work is done in parallel, so both functors must be
+  /// thread safe. The expected signatures are:
+  /// \code
+  /// inner_range_type inner_range_fn(const Index& outer_element_index)
+  /// void inner_fill_fn(inner_tile& cell, const Index& outer_element_index)
+  /// \endcode
+  /// where \c outer_element_index is a global element index. A zero-volume
+  /// inner range yields a deliberately-null inner cell, which \p inner_fill_fn
+  /// is not invoked on.
+  /// \tparam InnerRangeFn callable producing each inner cell's range
+  /// \tparam InnerFillFn callable filling each non-null inner cell
+  /// \param[in] inner_range_fn maps a global outer element index to an inner
+  ///            range
+  /// \param[in] inner_fill_fn fills a non-null inner cell
+  /// \param[in] skip_set If false, will throw if any tiles are already set
+  /// \return the total number of tiles that have been (or will be) initialized
+  template <HostExecutor Exec = HostExecutor::Default, Fence fence = Fence::No,
+            typename InnerRangeFn, typename InnerFillFn,
+            typename V = value_type,
+            typename = std::enable_if_t<detail::is_tensor_of_tensor_v<V>>>
+  std::int64_t init_tiles_nested(InnerRangeFn&& inner_range_fn,
+                                 InnerFillFn&& inner_fill_fn,
+                                 bool skip_set = false) {
+    auto range_fn =
+        make_op_shared_handle(std::forward<InnerRangeFn>(inner_range_fn));
+    auto fill_fn =
+        make_op_shared_handle(std::forward<InnerFillFn>(inner_fill_fn));
+    return init_tiles<Exec, fence>(
+        [range_fn = std::move(range_fn), fill_fn = std::move(fill_fn)](
+            const TiledArray::Range& outer_tile_range) -> value_type {
+          return detail::make_nested_tile<value_type>(outer_tile_range,
+                                                      range_fn, fill_fn);
         },
         skip_set);
   }
@@ -1705,6 +1880,78 @@ class DistArray : public madness::archive::ParallelSerializableObject {
 #endif  // NDEBUG
   }
 
+  /// Applies an in-place mutator to every local, non-zero tile.
+
+  /// This is the engine behind the tensor-of-tensors \c fill* / \c
+  /// init_elements path: the array must already be shaped (every local tile
+  /// future registered, e.g. by the ToT range_fn constructor or \c
+  /// init_tiles_nested), and \p tile_op mutates each tile's data in place
+  /// without re-shaping it. \p tile_op must be callable as \c void(value_type&)
+  /// and thread safe. The mutation tasks chain off the existing tile futures,
+  /// so they run only after tile construction completes; this call blocks
+  /// locally until every mutation finishes, so on return all local tiles hold
+  /// their final data.
+  /// \tparam fence if Fence::Global, also fences the array's World on exit
+  /// \param[in] tile_op the in-place per-tile mutator
+  /// \return the number of tiles mutated
+  template <Fence fence = Fence::No, typename TileOp>
+  std::int64_t for_each_local_tile_inplace(TileOp&& tile_op) {
+    auto op = make_op_shared_handle(std::forward<TileOp>(tile_op));
+    World& w = world();
+    std::atomic<std::int64_t> ndone{0};
+    // hold the mutation-task futures so they (and the callbacks below) stay
+    // alive until every task has run; the futures are not re-set into the
+    // array -- the mutation happens behind the existing tile futures.
+    std::vector<Future<value_type>> done;
+    for (const auto& index : *(pmap())) {
+      if (is_zero(index)) continue;
+      Future<value_type>& fut = find_local(index);
+      Future<value_type> mutated = w.taskq.add(
+          [op](value_type& tile) -> value_type {
+            op(tile);
+            return tile;
+          },
+          fut);
+      mutated.register_callback(
+          new detail::IncrementCounter<std::atomic<std::int64_t>>(ndone));
+      done.emplace_back(std::move(mutated));
+    }
+    const std::int64_t ntiles = static_cast<std::int64_t>(done.size());
+    if (ntiles > 0)
+      w.await([&ndone, ntiles]() { return ndone.load() == ntiles; });
+    if constexpr (fence == Fence::Global) w.gop.fence();
+    return ntiles;
+  }
+
+  /// Builds one slab-backed arena tensor-of-tensors outer tile.
+
+  /// Engine behind the arena-ToT paths of \c init_elements and \c set:
+  /// \p cell_source(ordinal) returns a freestanding tensor whose range sizes
+  /// inner cell \p ordinal and whose data fills it. The slab is allocated by
+  /// \c detail::make_nested_tile and each cell deep-copies its source.
+  /// \param[in] outer_range the outer tile's range
+  /// \param[in] cell_source maps a cell ordinal to its source tensor
+  template <typename CellSource>
+  static value_type make_arena_nested_tile(const TiledArray::Range& outer_range,
+                                           CellSource&& cell_source) {
+    using InnerRange = typename element_type::range_type;
+    return detail::make_nested_tile<value_type>(
+        outer_range,
+        [&](const auto& idx) -> InnerRange {
+          // the inner-cell range type is built from an extent list -- it is
+          // not constructible from a foreign range type
+          const auto& src = cell_source(outer_range.ordinal(idx)).range();
+          const auto& src_ext = src.extent();
+          std::vector<std::size_t> ext(src.rank());
+          for (std::size_t d = 0; d < src.rank(); ++d)
+            ext[d] = static_cast<std::size_t>(src_ext[d]);
+          return InnerRange(ext);
+        },
+        [&](auto& cell, const auto& idx) {
+          cell = cell_source(outer_range.ordinal(idx));
+        });
+  }
+
   /// Code factorization of the actual assert for the other overloads
   void assert_pimpl() const {
     TA_ASSERT(pimpl_ &&
diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h
index 8196803152..45229f89c6 100644
--- a/src/TiledArray/einsum/tiledarray.h
+++ b/src/TiledArray/einsum/tiledarray.h
@@ -7,6 +7,7 @@
 #include "TiledArray/einsum/range.h"
 #include "TiledArray/expressions/fwd.h"
 #include "TiledArray/fwd.h"
+#include "TiledArray/tensor/arena_einsum.h"
 #include "TiledArray/tiled_range.h"
 #include "TiledArray/tiled_range1.h"
 
@@ -240,6 +241,34 @@ void replicate_tensor(Tensor &to, Tensor const &from) {
   // number of elements to be copied
   // (same as the number of elements in @c from)
   auto const N = from.range().volume();
+
+  if constexpr (TiledArray::is_arena_tensor_v<typename Tensor::value_type>) {
+    // arena ToT: an inner cell is an 8-byte view into the outer tile's slab.
+    // A plain std::copy of cells would leave `to` aliasing `from`'s slab --
+    // dangling once `from` is gone. Build `to` as a fresh slab-backed tile
+    // and deep-copy each replicated inner cell's element data.
+    using inner_t = typename Tensor::value_type;
+    using inner_range_t = typename inner_t::range_type;
+    using elem_t = typename inner_t::value_type;
+    const auto out_range = to.range();
+    const std::size_t M = out_range.volume();
+    auto range_fn = [&from, N](std::size_t ord) -> inner_range_t {
+      const auto &src = from.data()[ord % N];
+      return src.empty() ? inner_range_t{} : src.range();
+    };
+    to = detail::arena_outer_init<Tensor>(out_range, 1, range_fn,
+                                          alignof(elem_t), /*zero_init=*/false);
+    for (std::size_t ord = 0; ord < M; ++ord) {
+      auto &dst = to.data()[ord];
+      if (dst.empty()) continue;
+      const auto &src = from.data()[ord % N];
+      const elem_t *s = src.data();
+      elem_t *d = dst.data();
+      for (std::size_t k = 0; k < dst.size(); ++k) d[k] = s[k];
+    }
+    return;
+  }
+
   for (auto i = 0; i < to.range().volume(); i += N)
     std::copy(from.begin(), from.end(), to.data() + i);
 }
@@ -616,6 +645,14 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
     using ::Einsum::index::permutation;
     using TiledArray::Permutation;
 
+    // Temporary sub-Worlds used by the generalized-contraction path below.
+    // Declared before AB/C so it is destroyed *after* them: an ArrayTerm's
+    // `.ei` member is a DistArray bound to one of these sub-Worlds, and
+    // ~DistArray -> lazy_deleter dereferences that World. If a sub-World
+    // outlived only by `worlds` were torn down first, that deref would hit a
+    // dead World (e.g. while unwinding an exception thrown mid-contraction).
+    std::vector<std::shared_ptr<World>> worlds;
+
     std::tuple<ArrayTerm<ArrayA>, ArrayTerm<ArrayB>> AB{{A.array(), a},
                                                         {B.array(), b}};
 
@@ -686,6 +723,9 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
           std::is_same_v<TensorT,
                          typename decltype(A.array)::value_type::value_type>);
       constexpr bool is_tot = detail::is_tensor_v<TensorT>;
+      // A non-owning view inner cell (e.g. ArenaTensor) has no value-returning
+      // per-cell product; the legacy element-op path below cannot run for it.
+      constexpr bool inner_is_view = TiledArray::is_tensor_view_v<TensorT>;
       auto element_hadamard_op =
           (is_tot && inner.h)
               ? std::make_optional(
@@ -717,6 +757,8 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
 
       auto pa = A.permutation;
       auto pb = B.permutation;
+      auto arena_plan = detail::make_regime_a_arena_plan<ResultTensor>(
+          A, B, inner, /*inner_perm=*/C.permutation);
       for (Index h : H.tiles) {
         auto const pc = C.permutation;
         auto const c = apply(pc, h);
@@ -725,6 +767,9 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
         for (size_t i = 0; i < h.size(); ++i) {
           batch *= H.batch[i].at(h[i]);
         }
+        if (detail::run_regime_a_arena(arena_plan, h, batch, A, B, C,
+                                       C_local_tiles, tiles, trange))
+          continue;
         ResultTensor tile(TiledArray::Range{batch},
                           typename ResultTensor::value_type{});
         for (Index i : tiles) {
@@ -743,17 +788,28 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
           for (size_t k = 0; k < batch; ++k) {
             using Ix = ::Einsum::Index<std::string>;
             if constexpr (AreArrayToT<ArrayA, ArrayB>) {
-              auto aik = ai.batch(k);
-              auto bik = bi.batch(k);
-              auto vol = aik.total_size();
-              TA_ASSERT(vol == bik.total_size());
-
-              auto &el = tile({k});
-              using TensorT = std::remove_reference_t<decltype(el)>;
-
-              for (auto i = 0; i < vol; ++i)
-                add_to(el, element_product_op(aik.data()[i], bik.data()[i]));
-
+              if constexpr (inner_is_view) {
+                // View inner cells (e.g. ArenaTensor) have no value-returning
+                // per-cell product; only run_regime_a_arena can produce them.
+                // Reaching this legacy path means the arena plan was inactive
+                // -- typically a permuted inner contraction (see
+                // TODO(arena-einsum-perm) in arena_einsum.h).
+                TA_EXCEPTION(
+                    "TA::einsum: ToT x ToT product with view inner cells "
+                    "(e.g. ArenaTensor) is supported only via the regime-A "
+                    "arena fast path, which was inactive for this expression "
+                    "(likely a permuted inner contraction)");
+              } else {
+                auto aik = ai.batch(k);
+                auto bik = bi.batch(k);
+                auto vol = aik.total_size();
+                TA_ASSERT(vol == bik.total_size());
+
+                auto &el = tile({k});
+
+                for (auto i = 0; i < vol; ++i)
+                  add_to(el, element_product_op(aik.data()[i], bik.data()[i]));
+              }
             } else if constexpr (!AreArraySame<ArrayA, ArrayB>) {
               auto aik = ai.batch(k);
               auto bik = bi.batch(k);
@@ -762,11 +818,15 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
 
               auto &el = tile({k});
 
+              // Fused `el += inner_tensor * scalar` -- no scaled temporary
+              // (axpy_to works in-place, so it also supports view inner
+              // cells that cannot value-return a scaled tensor).
+              using TiledArray::axpy_to;
               for (auto i = 0; i < vol; ++i)
                 if constexpr (IsArrayToT<ArrayA>) {
-                  add_to(el, scale(aik.data()[i], bik.data()[i]));
+                  axpy_to(el, aik.data()[i], bik.data()[i]);
                 } else {
-                  add_to(el, scale(bik.data()[i], aik.data()[i]));
+                  axpy_to(el, bik.data()[i], aik.data()[i]);
                 }
 
             } else {
@@ -819,8 +879,6 @@ auto einsum(expressions::TsrExpr<ArrayA_> A, expressions::TsrExpr<ArrayB_> B,
     std::invoke(update_tr, std::get<0>(AB));
     std::invoke(update_tr, std::get<1>(AB));
 
-    std::vector<std::shared_ptr<World>> worlds;
-
     // iterates over tiles of hadamard indices
     for (Index h : H.tiles) {
       auto &[A, B] = AB;
diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h
index 946bf431b6..ee2f721aa3 100644
--- a/src/TiledArray/expressions/cont_engine.h
+++ b/src/TiledArray/expressions/cont_engine.h
@@ -30,6 +30,7 @@
 #include <TiledArray/expressions/binary_engine.h>
 #include <TiledArray/expressions/permopt.h>
 #include <TiledArray/proc_grid.h>
+#include <TiledArray/tensor/arena_einsum.h>
 #include <TiledArray/tensor/utility.h>
 #include <TiledArray/tile_op/contract_reduce.h>
 #include <TiledArray/tile_op/mult.h>
@@ -128,6 +129,17 @@ class ContEngine : public BinaryEngine<Derived> {
                                          const right_tile_element_type&)>
       element_return_op_;  ///< Same as element_nonreturn_op_ but returns
                            ///< the result
+  std::function<result_tile_type(const left_tile_type&,
+                                 const right_tile_type&)>
+      arena_hadamard_tile_op_;  ///< Whole-tile op for a Hadamard-outer +
+                                ///< contraction-inner product on arena
+                                ///< (view-inner-cell) ToT tiles, where a
+                                ///< value-returning per-cell op cannot be
+                                ///< used; null otherwise
+  using arena_plan_storage_t =
+      TiledArray::detail::arena_plan_storage_t<result_tile_type, left_tile_type,
+                                               right_tile_type>;
+  TA_NO_UNIQUE_ADDRESS arena_plan_storage_t arena_plan_;
   TiledArray::detail::ProcGrid
       proc_grid_;    ///< Process grid for the contraction
   size_type K_ = 1;  ///< Inner dimension size
@@ -253,7 +265,10 @@ class ContEngine : public BinaryEngine<Derived> {
     // 1. if ToT inner tile op has been initialized
     if constexpr (TiledArray::detail::is_tensor_of_tensor_v<value_type>) {
       TA_ASSERT(element_nonreturn_op_);
-      TA_ASSERT(element_return_op_);
+      // a view inner cell (e.g. ArenaTensor) cannot host a value-returning
+      // inner op, so element_return_op_ is intentionally left null for it
+      if constexpr (!TiledArray::is_tensor_view_v<result_tile_element_type>)
+        TA_ASSERT(element_return_op_);
     }
 
     // Initialize children
@@ -300,7 +315,13 @@ class ContEngine : public BinaryEngine<Derived> {
         // factor_ is absorbed into inner_tile_nonreturn_op_
         op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_),
                       outer_size(left_indices_), outer_size(right_indices_),
-                      total_perm, this->element_nonreturn_op_);
+                      total_perm, this->element_nonreturn_op_,
+                      std::move(this->arena_plan_));
+        // Plan ownership transferred to op_; mark carrier slot empty so any
+        // later use of arena_plan_ reads as "no plan" rather than moved-from.
+        if constexpr (!std::is_same_v<arena_plan_storage_t, std::monostate>) {
+          this->arena_plan_.reset();
+        }
       }
       trange_ = ContEngine_::make_trange(outer_perm);
       shape_ = ContEngine_::make_shape(outer_perm);
@@ -330,7 +351,13 @@ class ContEngine : public BinaryEngine<Derived> {
         // factor_ is absorbed into inner_tile_nonreturn_op_
         op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_),
                       outer_size(left_indices_), outer_size(right_indices_),
-                      total_perm, this->element_nonreturn_op_);
+                      total_perm, this->element_nonreturn_op_,
+                      std::move(this->arena_plan_));
+        // Plan ownership transferred to op_; mark carrier slot empty so any
+        // later use of arena_plan_ reads as "no plan" rather than moved-from.
+        if constexpr (!std::is_same_v<arena_plan_storage_t, std::monostate>) {
+          this->arena_plan_.reset();
+        }
       }
       trange_ = ContEngine_::make_trange();
       shape_ = ContEngine_::make_shape();
@@ -513,6 +540,110 @@ class ContEngine : public BinaryEngine<Derived> {
 
  protected:
   void init_inner_tile_op(const IndexList& inner_target_indices) {
+    if constexpr (TiledArray::detail::is_tensor_of_tensor_v<result_tile_type>) {
+      constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v<
+          result_tile_type, left_tile_type, right_tile_type>;
+      if constexpr (tot_x_tot &&
+                    TiledArray::is_tensor_view_v<result_tile_element_type>) {
+        // ToT x ToT with non-owning view inner cells (e.g. ArenaTensor). A
+        // view cell cannot host a value-returning inner op, so the
+        // owning-cell inner-op builder cannot be used. Two nested products
+        // are supported here:
+        //  - the elementwise pure Hadamard, where the inner element op is
+        //    unused anyway -- MultEngine::make_tile_op passes none and the
+        //    outer Mult tile op recurses through Tensor<view>::mult -- so
+        //    element_*_op_ is left null;
+        //  - the inner contraction (incl. inner outer-product), routed
+        //    through the arena fast path: it writes results in place into
+        //    pre-shaped view cells, so only element_nonreturn_op_ is needed.
+        // Every other nested product is deferred.
+        const auto inner_prod = this->inner_product_type();
+        if (inner_prod == TensorProduct::Hadamard &&
+            this->product_type() == TensorProduct::Hadamard) {
+          // pure Hadamard: element_*_op_ left null
+        } else if (inner_prod == TensorProduct::Contraction) {
+          using op_type = TiledArray::detail::ContractReduce<
+              result_tile_element_type, left_tile_element_type,
+              right_tile_element_type, scalar_type>;
+          // The inner op is built *perm-free* on purpose. factor_ is absorbed
+          // into element_nonreturn_op_; operand inner transposes are folded
+          // into the inner GEMM via left_/right_inner_permtype_. A non-identity
+          // inner *result* permutation is NOT placed on this op
+          // (make_fused_contraction_lambda asserts a perm-free op); it is
+          // applied downstream instead -- by op_'s post-processing permute for
+          // a contraction outer product, or by arena_hadamard_inner_contract's
+          // slab-level post-pass for a Hadamard outer product.
+          auto contrreduce_op = op_type(
+              to_cblas_op(this->left_inner_permtype_),
+              to_cblas_op(this->right_inner_permtype_), this->factor_,
+              inner_size(this->indices_), inner_size(this->left_indices_),
+              inner_size(this->right_indices_));
+          constexpr bool arena_eligible =
+              TiledArray::detail::is_contraction_arena_tot_v<
+                  result_tile_type, left_tile_type, right_tile_type>;
+          if constexpr (!arena_eligible) {
+            TA_EXCEPTION(
+                "nested contraction on view inner tiles is supported only "
+                "for arena-backed tensors-of-tensors");
+          } else {
+            // perm-free per-cell in-place contraction; used by both outer
+            // regimes below
+            this->element_nonreturn_op_ =
+                TiledArray::detail::make_fused_contraction_lambda<
+                    result_tile_element_type, left_tile_element_type,
+                    right_tile_element_type>(contrreduce_op);
+            if (this->product_type() == TensorProduct::Contraction) {
+              // outer contraction: the SUMMA result is shaped from operand
+              // inner cells by arena_plan_; op_'s post-processing permute
+              // applies the (outer + inner) result permutation.
+              this->arena_plan_ =
+                  TiledArray::detail::make_contraction_arena_plan<
+                      result_tile_type, left_tile_type, right_tile_type>(
+                      TiledArray::detail::ArenaInnerShapeKind::
+                          gemm_result_range,
+                      std::make_optional(contrreduce_op.gemm_helper()),
+                      Permutation{});
+              if (!bool(this->arena_plan_))
+                TA_EXCEPTION(
+                    "nested contraction on view inner tiles: the arena fast "
+                    "path was inactive (arena disabled)");
+            } else {
+              // outer Hadamard: MultEngine builds a binary tile op, which
+              // cannot use a value-returning per-cell op. Supply a whole-tile
+              // arena op that shapes the result from per-cell inner GEMMs and
+              // fills it in place; the inner result permutation is a
+              // slab-level post-pass inside the kernel.
+              this->arena_hadamard_tile_op_ =
+                  [cell_op = this->element_nonreturn_op_,
+                   inner_gh = contrreduce_op.gemm_helper(),
+                   inner_perm = inner(this->perm_)](
+                      const left_tile_type& l,
+                      const right_tile_type& r) -> result_tile_type {
+                return TiledArray::detail::arena_hadamard_inner_contract<
+                    result_tile_type>(l, r, inner_gh, cell_op, inner_perm);
+              };
+            }
+          }
+          // element_return_op_ left null: a view cell cannot be
+          // value-returned (see the init_struct precondition check).
+        } else {
+          TA_EXCEPTION(
+              "nested non-contraction product on view inner tiles (e.g. "
+              "ArenaTensor) is not yet supported; only the elementwise "
+              "Hadamard product and the inner contraction are");
+        }
+      } else {
+        init_inner_tile_op_owning_(inner_target_indices);
+      }
+    }
+  }
+
+  /// Builds the inner-cell element op (element_nonreturn_op_ /
+  /// element_return_op_) for a nested-tensor expression. init_inner_tile_op
+  /// dispatches every case here except ToT x ToT with non-owning view inner
+  /// cells -- a view cell cannot host the value-returning inner ops this
+  /// builder constructs.
+  void init_inner_tile_op_owning_(const IndexList& inner_target_indices) {
     if constexpr (TiledArray::detail::is_tensor_of_tensor_v<result_tile_type>) {
       constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v<
           result_tile_type, left_tile_type, right_tile_type>;
@@ -541,17 +672,52 @@ class ContEngine : public BinaryEngine<Derived> {
                             this->factor_, inner_size(this->indices_),
                             inner_size(this->left_indices_),
                             inner_size(this->right_indices_));
-          this->element_nonreturn_op_ =
-              [contrreduce_op, permute_inner = this->product_type() !=
-                                               TensorProduct::Contraction](
-                  result_tile_element_type& result,
-                  const left_tile_element_type& left,
-                  const right_tile_element_type& right) {
-                contrreduce_op(result, left, right);
-                // permutations of result are applied as "postprocessing"
-                if (permute_inner && !TA::empty(result))
-                  result = contrreduce_op(result);
-              };
+          constexpr bool arena_eligible =
+              TiledArray::detail::is_contraction_arena_tot_v<
+                  result_tile_type, left_tile_type, right_tile_type>;
+          if constexpr (arena_eligible) {
+            if (this->product_type() == TensorProduct::Contraction) {
+              this->arena_plan_ =
+                  TiledArray::detail::make_contraction_arena_plan<
+                      result_tile_type, left_tile_type, right_tile_type>(
+                      TiledArray::detail::ArenaInnerShapeKind::
+                          gemm_result_range,
+                      std::make_optional(contrreduce_op.gemm_helper()),
+                      inner(this->perm_));
+            }
+          }
+          if constexpr (arena_eligible) {
+            if (this->arena_plan_) {
+              this->element_nonreturn_op_ =
+                  TiledArray::detail::make_fused_contraction_lambda<
+                      result_tile_element_type, left_tile_element_type,
+                      right_tile_element_type>(contrreduce_op);
+            } else {
+              this->element_nonreturn_op_ =
+                  [contrreduce_op, permute_inner = this->product_type() !=
+                                                   TensorProduct::Contraction](
+                      result_tile_element_type& result,
+                      const left_tile_element_type& left,
+                      const right_tile_element_type& right) {
+                    contrreduce_op(result, left, right);
+                    // permutations of result are applied as "postprocessing"
+                    if (permute_inner && !TA::empty(result))
+                      result = contrreduce_op(result);
+                  };
+            }
+          } else {
+            this->element_nonreturn_op_ =
+                [contrreduce_op, permute_inner = this->product_type() !=
+                                                 TensorProduct::Contraction](
+                    result_tile_element_type& result,
+                    const left_tile_element_type& left,
+                    const right_tile_element_type& right) {
+                  contrreduce_op(result, left, right);
+                  // permutations of result are applied as "postprocessing"
+                  if (permute_inner && !TA::empty(result))
+                    result = contrreduce_op(result);
+                };
+          }
         }  // ToT x ToT
       } else if (inner_prod == TensorProduct::Hadamard) {
         TA_ASSERT(tot_x_tot);
@@ -574,26 +740,69 @@ class ContEngine : public BinaryEngine<Derived> {
                                                   ? inner(this->perm_)
                                                   : Permutation{})
                     : op_type(base_op_type());
-            this->element_nonreturn_op_ =
-                [mult_op, outer_prod](result_tile_element_type& result,
-                                      const left_tile_element_type& left,
-                                      const right_tile_element_type& right) {
-                  TA_ASSERT(outer_prod == TensorProduct::Hadamard ||
-                            outer_prod == TensorProduct::Contraction);
-                  if (outer_prod == TensorProduct::Hadamard)
-                    result = mult_op(left, right);
-                  else {  // outer_prod == TensorProduct::Contraction
-                    // there is currently no fused MultAdd ternary Op, only Add
-                    // and Mult thus implement this as 2 separate steps
-                    // TODO optimize by implementing (ternary) MultAdd
-                    if (empty(result))
+            constexpr bool arena_eligible_h_unit =
+                TiledArray::detail::is_contraction_arena_tot_v<
+                    result_tile_type, left_tile_type, right_tile_type>;
+            if constexpr (arena_eligible_h_unit) {
+              if (this->product_type() == TensorProduct::Contraction) {
+                this->arena_plan_ =
+                    TiledArray::detail::make_contraction_arena_plan<
+                        result_tile_type, left_tile_type, right_tile_type>(
+                        TiledArray::detail::ArenaInnerShapeKind::left_range,
+                        std::nullopt, inner(this->perm_));
+              }
+            }
+            if constexpr (arena_eligible_h_unit) {
+              if (this->arena_plan_) {
+                this->element_nonreturn_op_ =
+                    TiledArray::detail::make_fused_hadamard_lambda<
+                        result_tile_element_type, left_tile_element_type,
+                        right_tile_element_type>();
+              } else {
+                this->element_nonreturn_op_ =
+                    [mult_op, outer_prod](
+                        result_tile_element_type& result,
+                        const left_tile_element_type& left,
+                        const right_tile_element_type& right) {
+                      TA_ASSERT(outer_prod == TensorProduct::Hadamard ||
+                                outer_prod == TensorProduct::Contraction);
+                      if (outer_prod == TensorProduct::Hadamard)
+                        result = mult_op(left, right);
+                      else {  // outer_prod == TensorProduct::Contraction
+                        // there is currently no fused MultAdd ternary Op, only
+                        // Add and Mult thus implement this as 2 separate steps
+                        // TODO optimize by implementing (ternary) MultAdd
+                        if (empty(result))
+                          result = mult_op(left, right);
+                        else {
+                          auto result_increment = mult_op(left, right);
+                          add_to(result, result_increment);
+                        }
+                      }
+                    };
+              }
+            } else {
+              this->element_nonreturn_op_ =
+                  [mult_op, outer_prod](result_tile_element_type& result,
+                                        const left_tile_element_type& left,
+                                        const right_tile_element_type& right) {
+                    TA_ASSERT(outer_prod == TensorProduct::Hadamard ||
+                              outer_prod == TensorProduct::Contraction);
+                    if (outer_prod == TensorProduct::Hadamard)
                       result = mult_op(left, right);
-                    else {
-                      auto result_increment = mult_op(left, right);
-                      add_to(result, result_increment);
+                    else {  // outer_prod == TensorProduct::Contraction
+                      // there is currently no fused MultAdd ternary Op, only
+                      // Add and Mult thus implement this as 2 separate steps
+                      // TODO optimize by implementing (ternary) MultAdd
+                      if (empty(result))
+                        result = mult_op(left, right);
+                      else {
+                        auto result_increment = mult_op(left, right);
+                        add_to(result, result_increment);
+                      }
                     }
-                  }
-                };
+                  };
+            }
           } else {
             using base_op_type = TiledArray::detail::ScalMult<
                 result_tile_element_type, left_tile_element_type,
@@ -607,26 +816,69 @@ class ContEngine : public BinaryEngine<Derived> {
                                              ? inner(this->perm_)
                                              : Permutation{})
                                : op_type(base_op_type(this->factor_));
-            this->element_nonreturn_op_ =
-                [mult_op, outer_prod](result_tile_element_type& result,
-                                      const left_tile_element_type& left,
-                                      const right_tile_element_type& right) {
-                  TA_ASSERT(outer_prod == TensorProduct::Hadamard ||
-                            outer_prod == TensorProduct::Contraction);
-                  if (outer_prod == TensorProduct::Hadamard)
-                    result = mult_op(left, right);
-                  else {
-                    // there is currently no fused MultAdd ternary Op, only Add
-                    // and Mult thus implement this as 2 separate steps
-                    // TODO optimize by implementing (ternary) MultAdd
-                    if (empty(result))
+            constexpr bool arena_eligible_h_scaled =
+                TiledArray::detail::is_contraction_arena_tot_v<
+                    result_tile_type, left_tile_type, right_tile_type>;
+            if constexpr (arena_eligible_h_scaled) {
+              if (this->product_type() == TensorProduct::Contraction) {
+                this->arena_plan_ =
+                    TiledArray::detail::make_contraction_arena_plan<
+                        result_tile_type, left_tile_type, right_tile_type>(
+                        TiledArray::detail::ArenaInnerShapeKind::left_range,
+                        std::nullopt, inner(this->perm_));
+              }
+            }
+            if constexpr (arena_eligible_h_scaled) {
+              if (this->arena_plan_) {
+                this->element_nonreturn_op_ =
+                    TiledArray::detail::make_fused_hadamard_scaled_lambda<
+                        result_tile_element_type, left_tile_element_type,
+                        right_tile_element_type>(this->factor_);
+              } else {
+                this->element_nonreturn_op_ =
+                    [mult_op, outer_prod](
+                        result_tile_element_type& result,
+                        const left_tile_element_type& left,
+                        const right_tile_element_type& right) {
+                      TA_ASSERT(outer_prod == TensorProduct::Hadamard ||
+                                outer_prod == TensorProduct::Contraction);
+                      if (outer_prod == TensorProduct::Hadamard)
+                        result = mult_op(left, right);
+                      else {
+                        // there is currently no fused MultAdd ternary Op, only
+                        // Add and Mult thus implement this as 2 separate steps
+                        // TODO optimize by implementing (ternary) MultAdd
+                        if (empty(result))
+                          result = mult_op(left, right);
+                        else {
+                          auto result_increment = mult_op(left, right);
+                          add_to(result, result_increment);
+                        }
+                      }
+                    };
+              }
+            } else {
+              this->element_nonreturn_op_ =
+                  [mult_op, outer_prod](result_tile_element_type& result,
+                                        const left_tile_element_type& left,
+                                        const right_tile_element_type& right) {
+                    TA_ASSERT(outer_prod == TensorProduct::Hadamard ||
+                              outer_prod == TensorProduct::Contraction);
+                    if (outer_prod == TensorProduct::Hadamard)
                       result = mult_op(left, right);
                     else {
-                      auto result_increment = mult_op(left, right);
-                      add_to(result, result_increment);
+                      // there is currently no fused MultAdd ternary Op, only
+                      // Add and Mult thus implement this as 2 separate steps
+                      // TODO optimize by implementing (ternary) MultAdd
+                      if (empty(result))
+                        result = mult_op(left, right);
+                      else {
+                        auto result_increment = mult_op(left, right);
+                        add_to(result, result_increment);
+                      }
                     }
-                  }
-                };
+                  };
+            }
           }
         }  // ToT x T or T x ToT
       } else if (inner_prod == TensorProduct::Scale) {
@@ -640,44 +892,81 @@ class ContEngine : public BinaryEngine<Derived> {
                                                       right_tile_type> &&
             TiledArray::detail::is_tensor_v<left_tile_type>;
         if constexpr (tot_x_t || t_x_tot) {
-          auto scal_op = [perm = !this->implicit_permute_inner_
-                                     ? inner(this->perm_)
-                                     : Permutation{}](
-                             const left_tile_element_type& left,
-                             const right_tile_element_type& right)
-              -> result_tile_element_type {
-            using TiledArray::scale;
-            if constexpr (tot_x_t) {
-              if (perm)
-                return scale(left, right, perm);
-              else
-                return scale(left, right);
-            } else if constexpr (t_x_tot) {
-              if (perm)
-                return scale(right, left, perm);
-              else
-                return scale(right, left);
-            } else
-              abort();  // unreachable
+          constexpr auto kind =
+              tot_x_t ? TiledArray::detail::ArenaInnerShapeKind::left_range
+                      : TiledArray::detail::ArenaInnerShapeKind::right_range;
+          constexpr bool arena_eligible_scale =
+              TiledArray::detail::is_contraction_arena_tot_v<
+                  result_tile_type, left_tile_type, right_tile_type>;
+          if constexpr (arena_eligible_scale) {
+            if (this->product_type() == TensorProduct::Contraction) {
+              this->arena_plan_ =
+                  TiledArray::detail::make_contraction_arena_plan<
+                      result_tile_type, left_tile_type, right_tile_type>(
+                      kind, std::nullopt, inner(this->perm_));
+            }
+          }
+          // Fallback per-element op for the scale inner-product when no
+          // arena plan is in play. The Contraction outer product is the
+          // fused AXPY `result += (perm ^ tot) * scalar` -- no scaled
+          // temporary, so it works uniformly for owning and view inner
+          // cells. The Hadamard outer product is an assignment
+          // `result = (perm ^ tot) * scalar`, which needs value-returning
+          // `scale`; only owning inner cells support it.
+          auto fallback_op = [perm = !this->implicit_permute_inner_
+                                         ? inner(this->perm_)
+                                         : Permutation{},
+                              outer_prod = this->product_type()](
+                                 result_tile_element_type& result,
+                                 const left_tile_element_type& left,
+                                 const right_tile_element_type& right) {
+            if (outer_prod == TensorProduct::Contraction) {
+              using TiledArray::axpy_to;
+              if constexpr (tot_x_t) {
+                if (perm)
+                  axpy_to(result, left, right, perm);
+                else
+                  axpy_to(result, left, right);
+              } else {
+                if (perm)
+                  axpy_to(result, right, left, perm);
+                else
+                  axpy_to(result, right, left);
+              }
+            } else {
+              if constexpr (!TiledArray::is_tensor_view_v<
+                                result_tile_element_type>) {
+                using TiledArray::scale;
+                if constexpr (tot_x_t)
+                  result = perm ? scale(left, right, perm) : scale(left, right);
+                else
+                  result = perm ? scale(right, left, perm) : scale(right, left);
+              } else {
+                TA_EXCEPTION(
+                    "Tensor<View> scale-inner Hadamard-outer product: a "
+                    "view result cell cannot be value-assigned a fresh "
+                    "scaled tensor");
+              }
+            }
           };
-          this->element_nonreturn_op_ =
-              [scal_op, outer_prod = (this->product_type())](
-                  result_tile_element_type& result,
-                  const left_tile_element_type& left,
-                  const right_tile_element_type& right) {
-                if (outer_prod == TensorProduct::Contraction) {
-                  // TODO implement X-permuting AXPY
-                  if (empty(result))
-                    result = scal_op(left, right);
-                  else {
-                    auto result_increment = scal_op(left, right);
-                    add_to(result, result_increment);
-                  }
-                  // result += scal_op(left, right);
-                } else {
-                  result = scal_op(left, right);
-                }
-              };
+          if constexpr (arena_eligible_scale) {
+            if (this->arena_plan_) {
+              if constexpr (tot_x_t)
+                this->element_nonreturn_op_ =
+                    TiledArray::detail::make_fused_scale_tot_x_t_lambda<
+                        result_tile_element_type, left_tile_element_type,
+                        right_tile_element_type>();
+              else
+                this->element_nonreturn_op_ =
+                    TiledArray::detail::make_fused_scale_t_x_tot_lambda<
+                        result_tile_element_type, left_tile_element_type,
+                        right_tile_element_type>();
+            } else {
+              this->element_nonreturn_op_ = fallback_op;
+            }
+          } else {
+            this->element_nonreturn_op_ = fallback_op;
+          }
         }
       } else
         abort();  // unsupported TensorProduct type
diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h
index 84d11bd4c0..f0942ee48d 100644
--- a/src/TiledArray/expressions/mult_engine.h
+++ b/src/TiledArray/expressions/mult_engine.h
@@ -408,7 +408,16 @@ class MultEngine : public ContEngine<MultEngine<Left, Right, Result>> {
                                 // dimensions as well
         return op_type(op_base_type());
       } else if (inner_prod == TensorProduct::Contraction) {
-        return op_type(op_base_type(this->element_return_op_));
+        if constexpr (TiledArray::is_tensor_view_v<
+                          typename value_type::value_type>) {
+          // arena ToT: a view inner cell cannot host a value-returning
+          // per-cell op, so delegate the whole tile product to the arena op
+          // built in init_inner_tile_op
+          return op_type(op_base_type(typename op_base_type::tile_op_tag{},
+                                      this->arena_hadamard_tile_op_));
+        } else {
+          return op_type(op_base_type(this->element_return_op_));
+        }
       } else if (inner_prod == TensorProduct::Scale) {
         return op_type(op_base_type());
       } else
@@ -438,13 +447,30 @@ class MultEngine : public ContEngine<MultEngine<Left, Right, Result>> {
       } else if (inner_prod == TensorProduct::Contraction) {
         // inner permutation, if needed, was fused into inner op, do not apply
         // inner part of the perm again
-        return op_type(op_base_type(this->element_return_op_),
-                       outer(std::forward<Perm>(perm)));
+        if constexpr (TiledArray::is_tensor_view_v<
+                          typename value_type::value_type>) {
+          return op_type(op_base_type(typename op_base_type::tile_op_tag{},
+                                      this->arena_hadamard_tile_op_),
+                         outer(std::forward<Perm>(perm)));
+        } else {
+          return op_type(op_base_type(this->element_return_op_),
+                         outer(std::forward<Perm>(perm)));
+        }
       } else if (inner_prod == TensorProduct::Scale) {
-        // inner permutation, if needed, was fused into inner op, do not apply
-        // inner part of the perm again
-        return op_type(op_base_type(this->element_return_op_),
-                       outer(std::forward<Perm>(perm)));
+        if constexpr (TiledArray::is_tensor_view_v<
+                          typename value_type::value_type>) {
+          // arena ToT: a view result cell cannot be value-assigned a scaled
+          // tensor, so the element_return_op_ path is unusable. Route through
+          // the arena-aware mult CPO with the full permutation instead -- it
+          // shapes and fills the result tile as a unit and applies the
+          // (outer + inner) result permutation in place.
+          return op_type(op_base_type(), std::forward<Perm>(perm));
+        } else {
+          // inner permutation, if needed, was fused into inner op, do not
+          // apply inner part of the perm again
+          return op_type(op_base_type(this->element_return_op_),
+                         outer(std::forward<Perm>(perm)));
+        }
       } else
         abort();
     } else {  // plain tensor
diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h
index d8841a8596..009d32f9b2 100644
--- a/src/TiledArray/external/btas.h
+++ b/src/TiledArray/external/btas.h
@@ -1223,6 +1223,9 @@ namespace TiledArray {
 namespace detail {
 template <typename T, typename... Args>
 struct ta_ops_match_tensor<::btas::Tensor<T, Args...>> : std::false_type {};
+template <typename T, typename... Args>
+struct ta_ops_match_tensor_inplace<::btas::Tensor<T, Args...>>
+    : std::false_type {};
 }  // namespace detail
 }  // namespace TiledArray
 
@@ -1238,6 +1241,12 @@ template <typename T>
 inline constexpr bool ta_ops_match_tensor_v =
     ::TiledArray::detail::is_btas_tensor_v<
         ::TiledArray::detail::remove_cvr_t<T>>;
+// btas::Tensor is freestanding (owning); the compound-assignment predicate is
+// identical to the value-returning one.
+template <typename T>
+inline constexpr bool ta_ops_match_tensor_inplace_v =
+    ::TiledArray::detail::is_btas_tensor_v<
+        ::TiledArray::detail::remove_cvr_t<T>>;
 }  // namespace detail
 
 #include <TiledArray/tensor/operators_body.ipp>
diff --git a/src/TiledArray/replicator.h b/src/TiledArray/replicator.h
index 52ae446af1..8794954a20 100644
--- a/src/TiledArray/replicator.h
+++ b/src/TiledArray/replicator.h
@@ -166,8 +166,12 @@ class Replicator : public madness::WorldObject<Replicator<A> >,
     // Generate a list of local tiles from other.
     typename A::pmap_interface::const_iterator end = source.pmap()->end();
     typename A::pmap_interface::const_iterator it = source.pmap()->begin();
-    indices_.reserve(source.pmap()->local_size());
-    data_.reserve(source.pmap()->local_size());
+    // local_size() is only a reserve() hint; some pmaps (e.g. HashPmap) do
+    // not precompute it -- skip the hint rather than assert.
+    if (source.pmap()->known_local_size()) {
+      indices_.reserve(source.pmap()->local_size());
+      data_.reserve(source.pmap()->local_size());
+    }
     if (source.is_dense()) {
       // When dense, all tiles are present
       for (; it != end; ++it) {
diff --git a/src/TiledArray/tensor/arena.h b/src/TiledArray/tensor/arena.h
new file mode 100644
index 0000000000..b37b962436
--- /dev/null
+++ b/src/TiledArray/tensor/arena.h
@@ -0,0 +1,160 @@
+/// Arena implementation
+#ifndef TILEDARRAY_TENSOR_ARENA_H__INCLUDED
+#define TILEDARRAY_TENSOR_ARENA_H__INCLUDED
+
+#include "TiledArray/config.h"
+#include "TiledArray/error.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <memory_resource>
+#include <utility>
+#include <vector>
+
+namespace TiledArray {
+namespace detail {
+
+/// Kill switch: when true, hooks fall back to the legacy heap path.
+inline bool& arena_disabled() {
+  static bool flag = false;
+  return flag;
+}
+
+/// One-shot bump allocator; slab is co-owned via aliasing shared_ptrs.
+class Arena {
+ public:
+  explicit Arena(
+      std::pmr::memory_resource* mr = std::pmr::new_delete_resource()) noexcept
+      : resource_(mr) {
+    TA_ASSERT(resource_ != nullptr);
+  }
+
+  Arena(const Arena&) = delete;
+  Arena& operator=(const Arena&) = delete;
+  Arena(Arena&&) noexcept = default;
+  Arena& operator=(Arena&&) noexcept = default;
+  ~Arena() = default;
+
+  /// Allocate the slab once; zero_init clears it for accumulation kernels.
+  /// `alignment` (default `alignof(std::max_align_t)`) is the alignment of
+  /// the slab base; pass a larger power-of-two when callers need SIMD-aligned
+  /// element pointers at known interior offsets.
+  void reserve(std::size_t bytes, bool zero_init = false,
+               std::size_t alignment = alignof(std::max_align_t)) {
+    TA_ASSERT(capacity_ == 0);
+    TA_ASSERT(bytes > 0);
+    TA_ASSERT(alignment >= alignof(std::max_align_t));
+    TA_ASSERT((alignment & (alignment - 1)) == 0);
+    void* raw = resource_->allocate(bytes, alignment);
+    auto* mr = resource_;
+    auto deleter = [mr, bytes, alignment](std::byte* p) noexcept {
+      mr->deallocate(p, bytes, alignment);
+    };
+    slab_ = std::shared_ptr<std::byte[]>(static_cast<std::byte*>(raw),
+                                         std::move(deleter));
+    capacity_ = bytes;
+    cursor_ = 0;
+    if (zero_init) std::memset(slab_.get(), 0, bytes);
+  }
+
+  /// Aliasing view at a caller-aligned offset.
+  template <typename T>
+  std::shared_ptr<T[]> slice(std::size_t offset, std::size_t /*n_elem*/) const {
+    TA_ASSERT(slab_);
+    TA_ASSERT(offset % alignof(T) == 0);
+    TA_ASSERT(offset <= capacity_);
+    auto* p = reinterpret_cast<T*>(slab_.get() + offset);
+    return std::shared_ptr<T[]>(slab_, p);
+  }
+
+  /// Bump-allocate n elements of T; result is T-aligned.
+  template <typename T>
+  std::shared_ptr<T[]> claim(std::size_t n) {
+    TA_ASSERT(slab_);
+    auto base = reinterpret_cast<std::uintptr_t>(slab_.get() + cursor_);
+    auto aligned = (base + alignof(T) - 1) & ~(alignof(T) - 1);
+    std::size_t pad = static_cast<std::size_t>(aligned - base);
+    std::size_t consumed = pad + n * sizeof(T);
+    TA_ASSERT(cursor_ + consumed <= capacity_);
+    cursor_ += consumed;
+    return std::shared_ptr<T[]>(slab_, reinterpret_cast<T*>(aligned));
+  }
+
+  std::size_t capacity() const noexcept { return capacity_; }
+  std::size_t cursor() const noexcept { return cursor_; }
+  std::size_t remaining() const noexcept { return capacity_ - cursor_; }
+  bool empty() const noexcept { return cursor_ == 0; }
+  std::pmr::memory_resource* resource() const noexcept { return resource_; }
+
+ private:
+  std::pmr::memory_resource* resource_;
+  std::shared_ptr<std::byte[]> slab_;
+  std::size_t capacity_ = 0;
+  std::size_t cursor_ = 0;
+};
+
+/// Per-cell offsets and total slab size produced by plan().
+struct ArenaPlan {
+  std::vector<std::size_t> offsets;
+  std::size_t total_bytes = 0;
+};
+
+/// Cache-line-floor alignment used by production callers.
+inline constexpr std::size_t kArenaCachelineAlign = 128;
+
+/// Round bytes up to a power-of-two alignment.
+inline std::size_t arena_align_up(std::size_t bytes,
+                                  std::size_t alignment) noexcept {
+  return (bytes + alignment - 1) & ~(alignment - 1);
+}
+
+/// Pre-walk cells once to compute offsets and total bytes.
+template <typename ShapeFn>
+ArenaPlan plan(std::size_t N_cells, ShapeFn&& shape_fn,
+               std::size_t element_size, std::size_t alignment) {
+  ArenaPlan out;
+  out.offsets.resize(N_cells);
+  std::size_t total = 0;
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    out.offsets[ord] = total;
+    auto&& r = shape_fn(ord);
+    std::size_t bytes = r.volume() * element_size;
+    total += arena_align_up(bytes, alignment);
+  }
+  out.total_bytes = total;
+  return out;
+}
+
+/// PMR adapter over an Arena; deallocation is a no-op (slab-owned lifetime).
+class ArenaResource final : public std::pmr::memory_resource {
+ public:
+  explicit ArenaResource(Arena* arena) noexcept : arena_(arena) {
+    TA_ASSERT(arena != nullptr);
+  }
+
+  Arena* arena() const noexcept { return arena_; }
+
+ protected:
+  void* do_allocate(std::size_t bytes, std::size_t alignment) override {
+    auto h = arena_->claim<std::byte>(arena_align_up(bytes, alignment));
+    return h.get();
+  }
+
+  void do_deallocate(void* /*p*/, std::size_t /*bytes*/,
+                     std::size_t /*alignment*/) override {}
+
+  bool do_is_equal(
+      const std::pmr::memory_resource& other) const noexcept override {
+    return this == &other;
+  }
+
+ private:
+  Arena* arena_;
+};
+
+}  // namespace detail
+}  // namespace TiledArray
+
+#endif
diff --git a/src/TiledArray/tensor/arena_einsum.h b/src/TiledArray/tensor/arena_einsum.h
new file mode 100644
index 0000000000..7e917e2bf8
--- /dev/null
+++ b/src/TiledArray/tensor/arena_einsum.h
@@ -0,0 +1,793 @@
+/// Arena-aware ToT einsum: plans, fused kernels, and dispatch.
+
+#ifndef TILEDARRAY_TENSOR_ARENA_EINSUM_H__INCLUDED
+#define TILEDARRAY_TENSOR_ARENA_EINSUM_H__INCLUDED
+
+#include "TiledArray/error.h"
+#include "TiledArray/math/gemm_helper.h"
+#include "TiledArray/permutation.h"
+#include "TiledArray/tensor/arena.h"
+#include "TiledArray/tensor/arena_kernels.h"
+#include "TiledArray/tensor/kernels.h"
+#include "TiledArray/tensor/type_traits.h"
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <variant>
+
+#if defined(_MSC_VER) && _MSC_VER < 1937  // VS 2022 < 17.7
+#define TA_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
+#else
+#define TA_NO_UNIQUE_ADDRESS [[no_unique_address]]
+#endif
+
+namespace TiledArray::detail {
+
+/// Specifies how an inner-cell range is derived from operand inner cells.
+enum class ArenaInnerShapeKind {
+  left_range,        // Hadamard inner; Scale tot_x_t
+  right_range,       // Scale t_x_tot
+  gemm_result_range  // inner Contraction (uses inner_gh)
+};
+
+/// Inner-shape derivation plan: kind + (optional) inner GemmHelper.
+struct ArenaInnerShapePlan {
+  ArenaInnerShapeKind kind;
+  std::optional<math::GemmHelper> inner_gh;  // only for gemm_result_range
+
+  /// Derives one result inner range from operand inner cells.
+  template <typename ResultInnerRange, typename LInner, typename RInner>
+  ResultInnerRange make(const LInner& l, const RInner& r) const {
+    switch (kind) {
+      case ArenaInnerShapeKind::left_range:
+        return l.range();
+      case ArenaInnerShapeKind::right_range:
+        return r.range();
+      case ArenaInnerShapeKind::gemm_result_range:
+        TA_ASSERT(inner_gh.has_value());
+        return inner_gh->template make_result_range<ResultInnerRange>(
+            l.range(), r.range());
+    }
+    TA_ASSERT(false);
+    return ResultInnerRange{};
+  }
+};
+
+/// Derives result ranges and constructs non-empty inner cells in one arena
+/// slab.
+template <typename Result, typename Left, typename Right>
+class ContractionArenaPlan {
+ public:
+  /// Stores the inner shape plan used to construct result cells.
+  explicit ContractionArenaPlan(ArenaInnerShapePlan p)
+      : inner_plan_(std::move(p)) {}
+
+  /// Constructs a result tile whose non-empty inner cells alias arena storage.
+  Result reserve_and_construct(const Left& left, const Right& right,
+                               const math::GemmHelper& outer_gh) const;
+
+  /// Grows an already-constructed result tile in place so it covers every
+  /// inner cell implied by this `left`/`right` K-panel. A SUMMA reduction
+  /// shapes the result from its first K-panel only; a later panel of a
+  /// contracted-dimension-sparse ToT operand can touch inner cells the first
+  /// panel left null, so each subsequent panel must extend the result.
+  void grow_to_cover(Result& result, const Left& left, const Right& right,
+                     const math::GemmHelper& outer_gh) const;
+
+ private:
+  /// Per-output-cell inner ranges implied by one `left`/`right` K-panel.
+  /// Deduced return type: spelling `Result::value_type::range_type` in the
+  /// declaration would make the whole class ill-formed for a non-ToT
+  /// `Result`, but `make_contraction_arena_plan` names this class in its
+  /// return type unconditionally (and returns nullopt for non-ToT).
+  auto operand_inner_ranges(const Left& left, const Right& right,
+                            const math::GemmHelper& outer_gh) const;
+
+  ArenaInnerShapePlan inner_plan_{};
+};
+
+/// True when `T` is a `TA::Tensor` outer whose inner cells the arena
+/// machinery knows how to allocate (legacy `TA::Tensor` ToT inner or the
+/// pinned-view `ArenaTensor`). Doesn't require `is_tensor_of_tensor_v` --
+/// `ArenaTensor` is deliberately not registered as `is_tensor_helper`, so
+/// trait propagation can't reach it that way.
+template <typename T>
+inline constexpr bool is_arena_eligible_outer_v =
+    is_ta_tensor_v<T> &&
+    (is_ta_tensor_v<typename T::value_type> ||
+     ::TiledArray::is_arena_tensor_v<typename T::value_type>);
+
+/// True when `T` is an inner-cell type that the arena machinery treats as
+/// tensor-shaped (as opposed to a scalar in mixed Scale ops). Covers the
+/// legacy `TA::Tensor` inner and the pinned `ArenaTensor`. Used by the
+/// regime-A `accumulate` dispatch to distinguish the tensor-inner branches
+/// from the scalar-inner ones in `scale_left`/`scale_right` cases.
+template <typename T>
+inline constexpr bool is_arena_inner_cell_v =
+    is_ta_tensor_v<T> || ::TiledArray::is_arena_tensor_v<T>;
+
+/// True when the result is an arena-eligible outer; gates the arena
+/// allocation path in cont_engine.
+template <typename Result, typename Left, typename Right>
+inline constexpr bool is_contraction_arena_tot_v =
+    is_arena_eligible_outer_v<Result>;
+
+/// Stores an arena plan for ToT results and std::monostate otherwise.
+template <typename Result, typename Left, typename Right>
+using arena_plan_storage_t =
+    std::conditional_t<is_contraction_arena_tot_v<Result, Left, Right>,
+                       std::optional<ContractionArenaPlan<Result, Left, Right>>,
+                       std::monostate>;
+
+/// Builds a contraction arena plan when the result and inner permutation allow
+/// it.
+template <typename Result, typename Left, typename Right>
+auto make_contraction_arena_plan(ArenaInnerShapeKind inner_kind,
+                                 std::optional<math::GemmHelper> inner_gh,
+                                 const Permutation& inner_perm)
+    -> std::optional<ContractionArenaPlan<Result, Left, Right>> {
+  if (arena_disabled()) return std::nullopt;
+  if constexpr (!is_contraction_arena_tot_v<Result, Left, Right>) {
+    return std::nullopt;
+  } else {
+    if (bool(inner_perm) && !inner_perm.is_identity()) return std::nullopt;
+    if (inner_kind != ArenaInnerShapeKind::gemm_result_range)
+      inner_gh.reset();
+    else if (!inner_gh.has_value())
+      return std::nullopt;
+    return std::optional<ContractionArenaPlan<Result, Left, Right>>(
+        std::in_place, ArenaInnerShapePlan{inner_kind, std::move(inner_gh)});
+  }
+}
+
+/// Per-output-cell inner ranges implied by one `left`/`right` K-panel.
+template <typename Result, typename Left, typename Right>
+auto ContractionArenaPlan<Result, Left, Right>::operand_inner_ranges(
+    const Left& left, const Right& right,
+    const math::GemmHelper& outer_gh) const {
+  using inner_t = typename Result::value_type;
+  using inner_range_t = typename inner_t::range_type;
+  using integer = math::blas::integer;
+
+  integer M, N, K;
+  outer_gh.compute_matrix_sizes(M, N, K, left.range(), right.range());
+  const integer lda = (outer_gh.left_op() == math::blas::NoTranspose) ? K : M;
+  const integer ldb = (outer_gh.right_op() == math::blas::NoTranspose) ? N : K;
+  TA_ASSERT(left.nbatch() == right.nbatch());
+  const std::size_t batch_sz = static_cast<std::size_t>(left.nbatch());
+  const std::size_t mn =
+      static_cast<std::size_t>(M) * static_cast<std::size_t>(N);
+
+  auto range_for = [&](std::size_t ord) -> inner_range_t {
+    if (mn == 0) return inner_range_t{};
+    const integer b = static_cast<integer>(ord / mn);
+    const integer rem = static_cast<integer>(ord % mn);
+    const integer m = rem / N;
+    const integer n = rem % N;
+
+    if (inner_plan_.kind == ArenaInnerShapeKind::left_range) {
+      if constexpr (is_arena_eligible_outer_v<Left>) {
+        const auto* lbase = left.batch_data(static_cast<std::size_t>(b));
+        for (integer k = 0; k != K; ++k) {
+          const auto aoff = (outer_gh.left_op() == math::blas::NoTranspose)
+                                ? m * lda + k
+                                : k * lda + m;
+          const auto& lc = *(lbase + aoff);
+          if (!lc.empty()) return lc.range();
+        }
+      }
+      return inner_range_t{};
+    }
+    if (inner_plan_.kind == ArenaInnerShapeKind::right_range) {
+      if constexpr (is_arena_eligible_outer_v<Right>) {
+        const auto* rbase = right.batch_data(static_cast<std::size_t>(b));
+        for (integer k = 0; k != K; ++k) {
+          const auto boff = (outer_gh.right_op() == math::blas::NoTranspose)
+                                ? k * ldb + n
+                                : n * ldb + k;
+          const auto& rc = *(rbase + boff);
+          if (!rc.empty()) return rc.range();
+        }
+      }
+      return inner_range_t{};
+    }
+    // gemm_result_range needs both operands to be ToT.
+    if constexpr (is_arena_eligible_outer_v<Left> &&
+                  is_arena_eligible_outer_v<Right>) {
+      const auto* lbase = left.batch_data(static_cast<std::size_t>(b));
+      const auto* rbase = right.batch_data(static_cast<std::size_t>(b));
+      for (integer k = 0; k != K; ++k) {
+        const auto aoff = (outer_gh.left_op() == math::blas::NoTranspose)
+                              ? m * lda + k
+                              : k * lda + m;
+        const auto boff = (outer_gh.right_op() == math::blas::NoTranspose)
+                              ? k * ldb + n
+                              : n * ldb + k;
+        const auto& lc = *(lbase + aoff);
+        const auto& rc = *(rbase + boff);
+        if (lc.empty() || rc.empty()) continue;
+        return inner_plan_.template make<inner_range_t>(lc, rc);
+      }
+    }
+    return inner_range_t{};
+  };
+
+  std::vector<inner_range_t> ranges;
+  const std::size_t N_cells = mn * batch_sz;
+  ranges.reserve(N_cells);
+  for (std::size_t ord = 0; ord < N_cells; ++ord)
+    ranges.emplace_back(range_for(ord));
+  return ranges;
+}
+
+/// Reserves arena storage and constructs the result tensor-of-tensor tile.
+template <typename Result, typename Left, typename Right>
+Result ContractionArenaPlan<Result, Left, Right>::reserve_and_construct(
+    const Left& left, const Right& right,
+    const math::GemmHelper& outer_gh) const {
+  using inner_range_t = typename Result::value_type::range_type;
+  auto outer_range =
+      outer_gh.template make_result_range<typename Result::range_type>(
+          left.range(), right.range());
+  TA_ASSERT(left.nbatch() == right.nbatch());
+  const std::size_t batch_sz = static_cast<std::size_t>(left.nbatch());
+  const auto ranges = operand_inner_ranges(left, right, outer_gh);
+  // arena_outer_init dispatches internally on the inner-cell type.
+  return detail::arena_outer_init<Result>(
+      outer_range, batch_sz,
+      [&ranges](std::size_t ord) -> inner_range_t { return ranges[ord]; });
+}
+
+/// Grows an already-constructed result tile to cover this K-panel's cells.
+template <typename Result, typename Left, typename Right>
+void ContractionArenaPlan<Result, Left, Right>::grow_to_cover(
+    Result& result, const Left& left, const Right& right,
+    const math::GemmHelper& outer_gh) const {
+  using inner_range_t = typename Result::value_type::range_type;
+  const auto ranges = operand_inner_ranges(left, right, outer_gh);
+  detail::arena_tot_grow_inplace(
+      result,
+      [&ranges](std::size_t ord) -> inner_range_t { return ranges[ord]; });
+}
+
+/// Accumulates a contraction into an already-allocated result cell.
+template <typename Result, typename Left, typename Right, typename Scalar>
+void fused_contraction_inplace(Result& result, const Left& left,
+                               const Right& right, Scalar alpha,
+                               const math::GemmHelper& gh) {
+  if (left.empty() || right.empty()) return;
+  TA_ASSERT(!result.empty());
+  // Free `gemm` CPO, not the member: `ArenaTensor` (a view) provides only the
+  // free in-place overload, while `TA::Tensor` is reached via the
+  // `tile_interface.h` CPO that forwards to its member.
+  gemm(result, left, right, alpha, gh);
+}
+
+/// Accumulates an elementwise product into an already-allocated result cell.
+template <typename Result, typename Left, typename Right>
+void fused_hadamard_inplace(Result& result, const Left& left,
+                            const Right& right) {
+  if (left.empty() || right.empty()) return;
+  TA_ASSERT(!result.empty());
+  inplace_tensor_op(
+      [](typename Result::value_type& MADNESS_RESTRICT r,
+         const typename Left::value_type& MADNESS_RESTRICT l,
+         const typename Right::value_type& MADNESS_RESTRICT rr) {
+        r += l * rr;
+      },
+      result, left, right);
+}
+
+/// Accumulates a scaled elementwise product into an allocated result cell.
+template <typename Result, typename Left, typename Right, typename Scalar>
+void fused_hadamard_scaled_inplace(Result& result, const Left& left,
+                                   const Right& right, Scalar factor) {
+  if (left.empty() || right.empty()) return;
+  TA_ASSERT(!result.empty());
+  // Preserve historical grouping: r += (l * rr) * factor.
+  inplace_tensor_op(
+      [factor](typename Result::value_type& MADNESS_RESTRICT r,
+               const typename Left::value_type& MADNESS_RESTRICT l,
+               const typename Right::value_type& MADNESS_RESTRICT rr) {
+        r += (l * rr) * factor;
+      },
+      result, left, right);
+}
+
+/// Accumulates a ToT cell scaled by a scalar right operand.
+template <typename Result, typename Left, typename Scalar>
+void fused_scale_tot_x_t_inplace(Result& result, const Left& left,
+                                 const Scalar& s) {
+  if (left.empty()) return;
+  TA_ASSERT(!result.empty());
+  inplace_tensor_op(
+      [s](typename Result::value_type& MADNESS_RESTRICT r,
+          const typename Left::value_type& MADNESS_RESTRICT l) { r += l * s; },
+      result, left);
+}
+
+/// Accumulates a ToT right operand scaled by a scalar left operand.
+template <typename Result, typename Scalar, typename Right>
+void fused_scale_t_x_tot_inplace(Result& result, const Scalar& s,
+                                 const Right& right) {
+  if (right.empty()) return;
+  TA_ASSERT(!result.empty());
+  inplace_tensor_op(
+      [s](typename Result::value_type& MADNESS_RESTRICT r,
+          const typename Right::value_type& MADNESS_RESTRICT rr) {
+        r += rr * s;
+      },
+      result, right);
+}
+
+/// Creates a fused contraction callback.
+template <typename Result, typename Left, typename Right, typename Op>
+auto make_fused_contraction_lambda(Op contrreduce_op) {
+  return
+      [contrreduce_op](Result& result, const Left& left, const Right& right) {
+        TA_ASSERT(!contrreduce_op.perm());
+        fused_contraction_inplace(result, left, right, contrreduce_op.factor(),
+                                  contrreduce_op.gemm_helper());
+      };
+}
+
+/// Hadamard-outer, contraction-inner ToT x ToT product into a fresh arena
+/// tile. `left` and `right` share the (Hadamard) outer layout; each result
+/// outer cell is the inner GEMM of the corresponding left/right inner cells,
+/// shaped by `inner_gh`. `cell_op(result_cell, left_cell, right_cell)` runs
+/// the per-cell in-place contraction (e.g. the make_fused_contraction_lambda
+/// callback). The per-cell op is perm-free; a non-identity `inner_perm`
+/// permutes the result cells' inner modes as a slab-level post-pass.
+template <typename Result, typename Left, typename Right, typename CellOp>
+Result arena_hadamard_inner_contract(const Left& left, const Right& right,
+                                     const math::GemmHelper& inner_gh,
+                                     const CellOp& cell_op,
+                                     const Permutation& inner_perm) {
+  using inner_range_t = typename Result::value_type::range_type;
+  TA_ASSERT(left.range().volume() == right.range().volume());
+  TA_ASSERT(left.nbatch() == right.nbatch());
+  const std::size_t N_cells = left.range().volume() * left.nbatch();
+  auto range_fn = [&left, &right, &inner_gh](std::size_t ord) -> inner_range_t {
+    const auto& lc = left.data()[ord];
+    const auto& rc = right.data()[ord];
+    if (lc.empty() || rc.empty()) return inner_range_t{};
+    return inner_gh.template make_result_range<inner_range_t>(lc.range(),
+                                                              rc.range());
+  };
+  Result result =
+      arena_outer_init<Result>(left.range(), left.nbatch(), range_fn);
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    if (result.data()[ord].empty()) continue;
+    cell_op(result.data()[ord], left.data()[ord], right.data()[ord]);
+  }
+  if (inner_perm && !inner_perm.is_identity())
+    result = arena_inner_permute<Result>(result, inner_perm);
+  return result;
+}
+
+/// Creates a fused Hadamard callback.
+template <typename Result, typename Left, typename Right>
+auto make_fused_hadamard_lambda() {
+  return [](Result& result, const Left& left, const Right& right) {
+    fused_hadamard_inplace(result, left, right);
+  };
+}
+
+/// Creates a fused scaled-Hadamard callback.
+template <typename Result, typename Left, typename Right, typename Scalar>
+auto make_fused_hadamard_scaled_lambda(Scalar factor) {
+  return [factor](Result& result, const Left& left, const Right& right) {
+    fused_hadamard_scaled_inplace(result, left, right, factor);
+  };
+}
+
+/// Creates a fused ToT-times-scalar callback.
+template <typename Result, typename Left, typename Right>
+auto make_fused_scale_tot_x_t_lambda() {
+  return [](Result& result, const Left& left, const Right& right) {
+    fused_scale_tot_x_t_inplace(result, left, right);
+  };
+}
+
+/// Creates a fused scalar-times-ToT callback.
+template <typename Result, typename Left, typename Right>
+auto make_fused_scale_t_x_tot_lambda() {
+  return [](Result& result, const Left& left, const Right& right) {
+    fused_scale_t_x_tot_inplace(result, left, right);
+  };
+}
+
+/// Discriminates the per-cell operation used by the arena regime-A path.
+enum class RegimeAInnerKind {
+  hadamard,
+  contraction,
+  scale_left,  // ToT × plain T → ToT (right operand contributes scalars)
+  scale_right  // plain T × ToT → ToT (left operand contributes scalars)
+};
+
+/// Permute the extents of `src` by `perm` and materialize a range of type
+/// `RangeT`. Generic over the inner-cell range types regime-A einsum sees:
+/// `TA::Range` (legacy `Tensor<Tensor>` inners) and `btas::zb::RangeNd`
+/// (`Tensor<ArenaTensor>` inners). `Permutation * Range` only exists for
+/// `TA::Range`, so the permutation is applied to a plain extent vector and
+/// the target range is rebuilt from the result.
+template <typename RangeT, typename SrcRange>
+RangeT arena_make_permuted_range(const TiledArray::Permutation& perm,
+                                 const SrcRange& src) {
+  const std::size_t rank = src.rank();
+  const auto& src_ext = src.extent();
+  container::svector<std::size_t> ext(rank);
+  for (std::size_t d = 0; d < rank; ++d)
+    ext[d] = static_cast<std::size_t>(src_ext[d]);
+  if (perm && !perm.is_identity()) {
+    TA_ASSERT(perm.size() == rank);
+    return RangeT(perm * ext);
+  }
+  return RangeT(ext);
+}
+
+/// Holds the inner operation plan for arena regime-A dispatch.
+template <typename Result, typename A, typename B, typename Inner>
+struct RegimeAArenaPlan {
+  using Annot = ::Einsum::Index<std::string>;
+
+  bool active = false;
+  RegimeAInnerKind kind = RegimeAInnerKind::hadamard;
+
+  // Exactly one plan optional is engaged; optionals avoid default construction.
+  std::optional<TensorHadamardPlan<Annot>> h_plan{};
+  std::optional<TensorContractionPlan<Annot>> c_plan{};
+
+  /// Derives the result inner range from a non-empty input-cell pair.
+  template <typename InnerRange, typename LRange, typename RRange>
+  InnerRange derive_inner_range(const LRange& l_range,
+                                const RRange& r_range) const {
+    switch (kind) {
+      case RegimeAInnerKind::hadamard:
+        TA_ASSERT(h_plan.has_value());
+        return arena_make_permuted_range<InnerRange>(h_plan->perm.AC, l_range);
+      case RegimeAInnerKind::contraction: {
+        TA_ASSERT(c_plan.has_value());
+        const auto& p = *c_plan;
+        using PlanIndices = std::remove_cvref_t<decltype(p.A)>;
+        using PlanIndex = typename PlanIndices::value_type;
+        using Extent =
+            std::remove_cv_t<typename decltype(std::declval<TiledArray::Range>()
+                                                   .extent())::value_type>;
+        using ExtentMap = ::Einsum::index::IndexMap<PlanIndex, Extent>;
+        ExtentMap extent = (ExtentMap{p.A, l_range.extent()} |
+                            ExtentMap{p.B, r_range.extent()});
+        container::vector<Extent> rng;
+        rng.reserve(p.e.size());
+        for (auto&& ix : p.e) rng.emplace_back(extent[ix]);
+        return InnerRange(rng);
+      }
+      case RegimeAInnerKind::scale_left:
+        // Scale-left preserves the ToT operand's inner range.
+        return InnerRange(l_range);
+      case RegimeAInnerKind::scale_right:
+        return InnerRange(r_range);
+    }
+    TA_ASSERT(false && "RegimeAInnerKind: unhandled kind");
+    return InnerRange{};
+  }
+
+  /// Accumulates one input-cell pair into the result cell.
+  template <typename ResultCell, typename LCell, typename RCell>
+  void accumulate(ResultCell& r, const LCell& l, const RCell& rr) const {
+    switch (kind) {
+      case RegimeAInnerKind::hadamard: {
+        if constexpr (is_arena_inner_cell_v<LCell> &&
+                      is_arena_inner_cell_v<RCell>) {
+          if (l.empty() || rr.empty()) return;
+          TA_ASSERT(h_plan.has_value());
+          // run_regime_a_arena has already hoisted any operand inner
+          // permutation, so l and rr are both in C-layout: the per-cell op
+          // is a flat r += l * rr on congruent cells.
+          fused_hadamard_inplace(r, l, rr);
+        }
+        return;
+      }
+      case RegimeAInnerKind::contraction: {
+        if constexpr (is_arena_inner_cell_v<LCell> &&
+                      is_arena_inner_cell_v<RCell>) {
+          if (l.empty() || rr.empty()) return;
+          TA_ASSERT(c_plan.has_value());
+          // run_regime_a_arena has already hoisted any operand inner
+          // permutation, so l and rr are in canonical (blas_layout) order:
+          // the per-cell op is a single canonical GEMM into r with beta=1.
+          // Uniform for TA::Tensor and ArenaTensor cells (free `gemm` CPO).
+          using Scalar = typename std::remove_cv_t<ResultCell>::numeric_type;
+          fused_contraction_inplace(r, l, rr, Scalar{1}, c_plan->gemm_helper);
+        }
+        return;
+      }
+      case RegimeAInnerKind::scale_left: {
+        // Scale-left receives a ToT inner cell and a scalar.
+        if constexpr (is_arena_inner_cell_v<LCell> &&
+                      !is_arena_inner_cell_v<RCell>) {
+          if (l.empty()) return;
+          fused_scale_tot_x_t_inplace(r, l, rr);
+        }
+        return;
+      }
+      case RegimeAInnerKind::scale_right: {
+        if constexpr (!is_arena_inner_cell_v<LCell> &&
+                      is_arena_inner_cell_v<RCell>) {
+          if (rr.empty()) return;
+          fused_scale_t_x_tot_inplace(r, l, rr);
+        }
+        return;
+      }
+    }
+  }
+};
+
+/// Builds an arena regime-A plan when result and permutation constraints allow
+/// it.
+template <typename Result, typename A, typename B, typename Inner,
+          typename PermT>
+auto make_regime_a_arena_plan(const A& a, const B& b, const Inner& inner,
+                              const PermT& inner_perm)
+    -> RegimeAArenaPlan<Result, A, B, Inner> {
+  using Plan = RegimeAArenaPlan<Result, A, B, Inner>;
+  Plan plan;
+  if (arena_disabled()) return plan;
+  if constexpr (!is_arena_eligible_outer_v<Result>) {
+    return plan;
+  } else {
+    // `inner_perm` (== C.permutation at the call site) is the result *outer*
+    // permutation. run_regime_a_arena applies it itself via tile.permute(pc)
+    // -- byte-identical to the legacy non-arena path, and supported for an
+    // arena ToT via arena_permute_shallow -- so it does not gate the plan.
+    // Inner-operand and inner-result permutations are likewise handled, by
+    // hoisting them to slab-level arena_inner_permute rewrites (see below).
+    (void)inner_perm;
+
+    using ArrayA_t = std::remove_cvref_t<decltype(a.array)>;
+    using ArrayB_t = std::remove_cvref_t<decltype(b.array)>;
+    // "Tot" here means "tile is a ToT-like thing whose inner cell is the
+    // tensor we want to operate on"; covers both legacy TA::Tensor inners
+    // and pinned ArenaTensor inners.
+    constexpr bool a_is_tot =
+        is_arena_eligible_outer_v<typename ArrayA_t::value_type>;
+    constexpr bool b_is_tot =
+        is_arena_eligible_outer_v<typename ArrayB_t::value_type>;
+
+    if constexpr (a_is_tot && b_is_tot) {
+      if (static_cast<bool>(inner.h)) {
+        plan.kind = RegimeAInnerKind::hadamard;
+        plan.h_plan.emplace(inner.A, inner.B, inner.C);
+        // A non-canonical inner Hadamard (h_plan.perm.{AC,BC} non-identity)
+        // is handled the same way as a non-canonical inner contraction:
+        // run_regime_a_arena hoists each operand inner permutation to a
+        // slab-level rewrite (arena_inner_permute) so both operands reach
+        // C-layout before the per-cell flat r += l * rr. No need to bail.
+      } else {
+        plan.kind = RegimeAInnerKind::contraction;
+        plan.c_plan.emplace(inner.A, inner.B, inner.C);
+        // A non-canonical inner contraction (c_plan.do_perm.{A,B,C} set --
+        // e.g. M/K- or M/N-interleaved inner annotations that are not
+        // GEMM-absorbable transposes) is still handled: run_regime_a_arena
+        // hoists each operand inner permutation, and the result inner
+        // permutation, to slab-level rewrites (arena_inner_permute), leaving
+        // the per-cell op a single canonical GEMM. No need to bail here.
+      }
+    } else if constexpr (a_is_tot && !b_is_tot) {
+      plan.kind = RegimeAInnerKind::scale_left;
+    } else if constexpr (!a_is_tot && b_is_tot) {
+      plan.kind = RegimeAInnerKind::scale_right;
+    } else {
+      return plan;
+    }
+    plan.active = true;
+    (void)a;
+    (void)b;
+    return plan;
+  }
+}
+
+/// Runs the arena regime-A path for one H-slice when the plan is active.
+template <typename Plan, typename HIndex, typename TermA, typename TermB,
+          typename TermC, typename LocalTiles, typename Tiles, typename Trange>
+bool run_regime_a_arena(const Plan& plan, const HIndex& h, std::size_t batch,
+                        const TermA& A, const TermB& B, const TermC& C,
+                        LocalTiles& C_local_tiles, const Tiles& tiles,
+                        const Trange& trange) {
+  if (!plan.active) return false;
+
+  using ResultTensor = typename LocalTiles::value_type::second_type;
+  // Guard avoids naming inner-cell APIs for non-ToT instantiations.
+  using ArrayA_t = std::remove_cvref_t<decltype(A.array)>;
+  using ArrayB_t = std::remove_cvref_t<decltype(B.array)>;
+  // ToT-like in the regime-A sense: tile is an arena-eligible outer
+  // (legacy TA::Tensor inner or pinned ArenaTensor inner).
+  constexpr bool a_is_tot =
+      is_arena_eligible_outer_v<typename ArrayA_t::value_type>;
+  constexpr bool b_is_tot =
+      is_arena_eligible_outer_v<typename ArrayB_t::value_type>;
+  if constexpr (!is_arena_eligible_outer_v<ResultTensor> ||
+                (!a_is_tot && !b_is_tot)) {
+    (void)h;
+    (void)batch;
+    (void)A;
+    (void)B;
+    (void)C;
+    (void)C_local_tiles;
+    (void)tiles;
+    (void)trange;
+    return false;
+  } else {
+    using InnerT = typename ResultTensor::value_type;
+    using InnerRange = typename InnerT::range_type;
+
+    const auto& pa = A.permutation;
+    const auto& pb = B.permutation;
+    const auto& pc = C.permutation;
+    auto const c = apply(pc, h);
+
+    if constexpr (a_is_tot && b_is_tot) {
+      using IIndex = ::Einsum::index::Index<std::size_t>;
+      auto range_for = [&](std::size_t k) -> InnerRange {
+        if (k >= batch) return InnerRange{};
+        for (IIndex i : tiles) {
+          const auto pahi_inv = apply_inverse(pa, h + i);
+          const auto pbhi_inv = apply_inverse(pb, h + i);
+          if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue;
+          auto ai = A.array.find(pahi_inv).get();
+          auto bi = B.array.find(pbhi_inv).get();
+          if (pa) ai = ai.permute(pa);
+          if (pb) bi = bi.permute(pb);
+          auto shape = trange.tile(i);
+          ai = ai.reshape(shape, batch);
+          bi = bi.reshape(shape, batch);
+          auto aik = ai.batch(k);
+          auto bik = bi.batch(k);
+          auto vol = aik.total_size();
+          TA_ASSERT(vol == bik.total_size());
+          for (decltype(vol) j = 0; j < vol; ++j) {
+            const auto& l_inner = aik.data()[j];
+            const auto& r_inner = bik.data()[j];
+            if (l_inner.empty() || r_inner.empty()) continue;
+            return plan.template derive_inner_range<InnerRange>(
+                l_inner.range(), r_inner.range());
+          }
+        }
+        return InnerRange{};
+      };
+
+      ResultTensor tile = arena_outer_init<ResultTensor>(
+          TiledArray::Range{batch}, /*batch_sz=*/1, range_for);
+
+      for (IIndex i : tiles) {
+        const auto pahi_inv = apply_inverse(pa, h + i);
+        const auto pbhi_inv = apply_inverse(pb, h + i);
+        if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue;
+        auto ai = A.array.find(pahi_inv).get();
+        auto bi = B.array.find(pbhi_inv).get();
+        if (pa) ai = ai.permute(pa);
+        if (pb) bi = bi.permute(pb);
+        // Hoist a non-canonical inner op's operand inner permutations to
+        // slab-level rewrites, so the per-cell op below stays canonical:
+        // contraction -> a single canonical GEMM; Hadamard -> a flat
+        // r += l * rr on congruent C-layout cells. No per-cell view permute.
+        if (plan.kind == RegimeAInnerKind::contraction) {
+          const auto& cp = *plan.c_plan;
+          if (cp.do_perm.A)
+            ai = arena_inner_permute<decltype(ai)>(ai, cp.perm.A);
+          if (cp.do_perm.B)
+            bi = arena_inner_permute<decltype(bi)>(bi, cp.perm.B);
+        } else if (plan.kind == RegimeAInnerKind::hadamard) {
+          const auto& hp = *plan.h_plan;
+          if (!hp.perm.AC.is_identity())
+            ai = arena_inner_permute<decltype(ai)>(ai, hp.perm.AC);
+          if (!hp.perm.BC.is_identity())
+            bi = arena_inner_permute<decltype(bi)>(bi, hp.perm.BC);
+        }
+        auto shape = trange.tile(i);
+        ai = ai.reshape(shape, batch);
+        bi = bi.reshape(shape, batch);
+        for (std::size_t k = 0; k < batch; ++k) {
+          auto& cell = tile({k});
+          if (cell.empty()) continue;
+          auto aik = ai.batch(k);
+          auto bik = bi.batch(k);
+          auto vol = aik.total_size();
+          TA_ASSERT(vol == bik.total_size());
+          for (decltype(vol) j = 0; j < vol; ++j) {
+            const auto& l_inner = aik.data()[j];
+            const auto& r_inner = bik.data()[j];
+            plan.accumulate(cell, l_inner, r_inner);
+          }
+        }
+      }
+
+      // Hoist the result inner permutation: cells were accumulated in
+      // blas_layout (e) order; rewrite the slab to the C inner order.
+      if (plan.kind == RegimeAInnerKind::contraction && plan.c_plan->do_perm.C)
+        tile =
+            arena_inner_permute<ResultTensor>(tile, plan.c_plan->perm.C.inv());
+      auto shape = apply_inverse(pc, C.array.trange().tile(c));
+      tile = tile.reshape(shape);
+      if (pc) tile = tile.permute(pc);
+      C_local_tiles.emplace_back(std::move(c), std::move(tile));
+      return true;
+    } else {
+      // Scale path has exactly one ToT operand and one scalar-cell operand.
+      using IIndex = ::Einsum::index::Index<std::size_t>;
+      auto range_for = [&](std::size_t k) -> InnerRange {
+        if (k >= batch) return InnerRange{};
+        for (IIndex i : tiles) {
+          const auto pahi_inv = apply_inverse(pa, h + i);
+          const auto pbhi_inv = apply_inverse(pb, h + i);
+          if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue;
+          auto ai = A.array.find(pahi_inv).get();
+          auto bi = B.array.find(pbhi_inv).get();
+          if (pa) ai = ai.permute(pa);
+          if (pb) bi = bi.permute(pb);
+          auto shape = trange.tile(i);
+          ai = ai.reshape(shape, batch);
+          bi = bi.reshape(shape, batch);
+          auto aik = ai.batch(k);
+          auto bik = bi.batch(k);
+          if constexpr (a_is_tot) {
+            auto vol = aik.total_size();
+            for (decltype(vol) j = 0; j < vol; ++j) {
+              const auto& l_inner = aik.data()[j];
+              if (l_inner.empty()) continue;
+              return InnerRange(l_inner.range());
+            }
+          } else {
+            auto vol = bik.total_size();
+            for (decltype(vol) j = 0; j < vol; ++j) {
+              const auto& r_inner = bik.data()[j];
+              if (r_inner.empty()) continue;
+              return InnerRange(r_inner.range());
+            }
+          }
+        }
+        return InnerRange{};
+      };
+
+      ResultTensor tile = arena_outer_init<ResultTensor>(
+          TiledArray::Range{batch}, /*batch_sz=*/1, range_for);
+
+      for (IIndex i : tiles) {
+        const auto pahi_inv = apply_inverse(pa, h + i);
+        const auto pbhi_inv = apply_inverse(pb, h + i);
+        if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue;
+        auto ai = A.array.find(pahi_inv).get();
+        auto bi = B.array.find(pbhi_inv).get();
+        if (pa) ai = ai.permute(pa);
+        if (pb) bi = bi.permute(pb);
+        auto shape = trange.tile(i);
+        ai = ai.reshape(shape, batch);
+        bi = bi.reshape(shape, batch);
+        for (std::size_t k = 0; k < batch; ++k) {
+          auto& cell = tile({k});
+          if (cell.empty()) continue;
+          auto aik = ai.batch(k);
+          auto bik = bi.batch(k);
+          auto vol = aik.total_size();
+          TA_ASSERT(vol == bik.total_size());
+          for (decltype(vol) j = 0; j < vol; ++j) {
+            const auto& l_elem = aik.data()[j];
+            const auto& r_elem = bik.data()[j];
+            plan.accumulate(cell, l_elem, r_elem);
+          }
+        }
+      }
+
+      auto shape = apply_inverse(pc, C.array.trange().tile(c));
+      tile = tile.reshape(shape);
+      if (pc) tile = tile.permute(pc);
+      C_local_tiles.emplace_back(std::move(c), std::move(tile));
+      return true;
+    }
+  }
+}
+
+}  // namespace TiledArray::detail
+
+#endif  // TILEDARRAY_TENSOR_ARENA_EINSUM_H__INCLUDED
diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h
new file mode 100644
index 0000000000..8dcd97c870
--- /dev/null
+++ b/src/TiledArray/tensor/arena_kernels.h
@@ -0,0 +1,442 @@
+/// Arena kernels for tensor-of-tensor (ToT) outer tiles.
+///
+/// One slab-backed builder family, dispatching on the inner-tile type:
+///   - `ArenaTensor` inners  -> slab of `Cell`s (range header + element data),
+///                              each inner cell is an 8-byte view;
+///   - `TA::Tensor` inners   -> slab of element data, each inner `Tensor`
+///                              aliases its slice of the slab.
+/// `is_arena_tensor_v<Inner>` selects the per-cell layout; everything else
+/// (planning, allocation, outer-tile assembly) is shared.
+
+#ifndef TILEDARRAY_TENSOR_ARENA_KERNELS_H__INCLUDED
+#define TILEDARRAY_TENSOR_ARENA_KERNELS_H__INCLUDED
+
+#include "TiledArray/config.h"
+#include "TiledArray/error.h"
+#include "TiledArray/tensor/arena.h"
+#include "TiledArray/tensor/arena_tensor.h"
+
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace TiledArray {
+namespace detail {
+
+namespace {
+
+/// Build outer storage whose deleter owns arena and alias keep-alive state.
+template <typename OuterTensor, typename KeepAlive>
+std::shared_ptr<typename OuterTensor::value_type[]> make_outer_data(
+    std::size_t n_cells, std::shared_ptr<Arena> arena_handle,
+    KeepAlive keep_alive) {
+  using inner_t = typename OuterTensor::value_type;
+  std::allocator<inner_t> allocator;
+  inner_t* raw = allocator.allocate(n_cells);
+  auto deleter =
+      [allocator = std::move(allocator), arena_handle = std::move(arena_handle),
+       keep_alive = std::move(keep_alive), n_cells](inner_t* p) mutable {
+        for (std::size_t i = 0; i < n_cells; ++i) (p + i)->~inner_t();
+        allocator.deallocate(p, n_cells);
+        (void)arena_handle;
+        (void)keep_alive;
+      };
+  return std::shared_ptr<inner_t[]>(raw, std::move(deleter));
+}
+
+}  // namespace
+
+/// Allocate a slab-backed ToT outer tile with caller-provided inner ranges.
+///
+/// `inner_range_fn(cell_ordinal)` -> inner `range_type` for each cell ordinal
+/// in `[0, outer_range.volume() * batch_sz)`; a zero-volume range yields a
+/// deliberately-null inner cell that consumes no slab bytes. Element storage
+/// is left zero-initialized when `zero_init` is true. `cell_stride_align` is
+/// the minimum byte stride between adjacent cells; it is bumped up to the
+/// inner type's natural alignment (`ArenaTensor::cell_alignment()`, or
+/// `alignof(T)` for `TA::Tensor` inners).
+template <typename OuterTensor, typename InnerRangeFn>
+OuterTensor arena_outer_init(
+    const typename OuterTensor::range_type& outer_range, std::size_t batch_sz,
+    InnerRangeFn&& inner_range_fn,
+    std::size_t cell_stride_align = kArenaCachelineAlign,
+    bool zero_init = true) {
+  using InnerT = typename OuterTensor::value_type;
+  using T = typename InnerT::value_type;
+  using InnerRange = typename InnerT::range_type;
+  constexpr bool arena = is_arena_tensor_v<InnerT>;
+
+  std::size_t stride = cell_stride_align;
+  if constexpr (arena) {
+    if (InnerT::cell_alignment() > stride) stride = InnerT::cell_alignment();
+  } else {
+    if (alignof(T) > stride) stride = alignof(T);
+  }
+  // Cells pack at `stride` granularity, but the slab base handed to
+  // `Arena::reserve` must be at least `max_align_t`-aligned.
+  const std::size_t slab_align =
+      stride > alignof(std::max_align_t) ? stride : alignof(std::max_align_t);
+
+  const std::size_t N_cells = outer_range.volume() * batch_sz;
+  constexpr std::size_t kNull = static_cast<std::size_t>(-1);
+  std::vector<InnerRange> ranges;
+  ranges.reserve(N_cells);
+  std::vector<std::size_t> offsets(N_cells, 0);
+  std::size_t total = 0;
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    ranges.emplace_back(inner_range_fn(ord));
+    const std::size_t vol = ranges.back().volume();
+    if (vol == 0) {
+      offsets[ord] = kNull;
+    } else {
+      offsets[ord] = total;
+      // `if constexpr`, not a ternary: `InnerT::cell_size` does not exist for
+      // a `TA::Tensor` inner, so the non-arena branch must not be formed.
+      std::size_t bytes;
+      if constexpr (arena)
+        bytes = InnerT::cell_size(vol);
+      else
+        bytes = vol * sizeof(T);
+      total += arena_align_up(bytes, stride);
+    }
+  }
+
+  auto arena_slab = std::make_shared<Arena>();
+  if (total > 0) arena_slab->reserve(total, zero_init, slab_align);
+  auto data = make_outer_data<OuterTensor>(N_cells, arena_slab,
+                                           std::shared_ptr<InnerT[]>{});
+  OuterTensor result(outer_range, batch_sz, std::move(data));
+
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    auto& r = ranges[ord];
+    if (offsets[ord] == kNull) {
+      if constexpr (arena) {
+        ::new (result.data() + ord) InnerT();
+      } else {
+        // Rank-0 empties must preserve Tensor's null-data/no-range invariant.
+        if (r.rank() == 0)
+          ::new (result.data() + ord) InnerT();
+        else
+          ::new (result.data() + ord) InnerT(r);
+      }
+    } else if constexpr (arena) {
+      // slice<std::byte>(offset, 1) returns an aliased shared_ptr; we only
+      // need its raw pointer to placement-new the Cell -- the slab's lifetime
+      // is held by `arena_handle` captured in the outer's deleter.
+      auto byte_view = arena_slab->template slice<std::byte>(offsets[ord], 1);
+      ::new (result.data() + ord)
+          InnerT(make_arena_tensor_in<T>(byte_view.get(), std::move(r)));
+    } else {
+      auto elem_data = arena_slab->template slice<T>(offsets[ord], r.volume());
+      ::new (result.data() + ord) InnerT(r, std::move(elem_data));
+    }
+  }
+  return result;
+}
+
+/// Default (no-op) fill for `make_nested_tile` -- leaves element storage
+/// zero-initialized.
+struct nested_fill_noop {
+  template <typename Cell, typename Index>
+  void operator()(Cell&, const Index&) const noexcept {}
+};
+
+/// Build one ToT outer tile over `outer_range`, two-pass:
+///   pass 1: `inner_range_fn(outer_element_index)` -> inner `range_type`
+///           sizes every inner cell (zero-volume -> deliberately-null cell);
+///   pass 2: `inner_fill_fn(inner_cell&, outer_element_index)` fills each
+///           non-null cell. The default fill leaves storage zero-initialized.
+/// Dispatches internally on the inner-tile type (see `arena_outer_init`).
+template <typename OuterTensor, typename InnerRangeFn,
+          typename InnerFillFn = nested_fill_noop>
+OuterTensor make_nested_tile(
+    const typename OuterTensor::range_type& outer_range,
+    InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) {
+  // arena_outer_init keys ranges on the cell ordinal; user code keys on the
+  // (global) outer element index -- translate via the outer range.
+  auto cell_range_fn = [&](std::size_t ord) {
+    return inner_range_fn(outer_range.idx(ord));
+  };
+  OuterTensor result =
+      arena_outer_init<OuterTensor>(outer_range, 1, cell_range_fn);
+  const std::size_t N = outer_range.volume();
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    auto& cell = result.data()[ord];
+    if (!cell.empty()) inner_fill_fn(cell, outer_range.idx(ord));
+  }
+  return result;
+}
+
+/// Apply a unary fill op while preserving each source inner range.
+/// `fill_op(dst_data, src_data, n_elements)` writes the result cell.
+template <typename OuterTensor, typename SrcOuterTensor, typename FillOp>
+OuterTensor arena_trivial_unary(const SrcOuterTensor& src, FillOp&& fill_op) {
+  using elem_t = typename OuterTensor::value_type::value_type;
+  using inner_range_t = typename OuterTensor::value_type::range_type;
+  // A null inner cell has no range to query (`ArenaTensor::range()` asserts
+  // non-null); map it to a default range -> a null result cell.
+  auto range_fn = [&src](std::size_t ord) -> inner_range_t {
+    const auto& s = src.data()[ord];
+    return s.empty() ? inner_range_t{} : s.range();
+  };
+  // Elementwise kernels pack tight (no cross-cell GEMM to amortize padding);
+  // the fill overwrites every element, so the slab need not be zero-init'd.
+  OuterTensor result = arena_outer_init<OuterTensor>(src.range(), src.nbatch(),
+                                                     range_fn, alignof(elem_t),
+                                                     /*zero_init=*/false);
+  const std::size_t N_cells = src.range().volume() * src.nbatch();
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    auto& dst = result.data()[ord];
+    if (dst.empty()) continue;
+    fill_op(dst.data(), src.data()[ord].data(), dst.size());
+  }
+  return result;
+}
+
+/// Apply a binary fill op using the left operand's inner ranges (asserted
+/// equal to the right's per cell). `fill_op(dst, l, r, n_elements)`.
+template <typename OuterTensor, typename LeftTensor, typename RightTensor,
+          typename FillOp>
+OuterTensor arena_trivial_binary(const LeftTensor& left,
+                                 const RightTensor& right, FillOp&& fill_op) {
+  using elem_t = typename OuterTensor::value_type::value_type;
+  using inner_range_t = typename OuterTensor::value_type::range_type;
+  TA_ASSERT(left.range().volume() == right.range().volume());
+  TA_ASSERT(left.nbatch() == right.nbatch());
+  auto range_fn = [&left](std::size_t ord) -> inner_range_t {
+    const auto& l = left.data()[ord];
+    return l.empty() ? inner_range_t{} : l.range();
+  };
+  OuterTensor result = arena_outer_init<OuterTensor>(
+      left.range(), left.nbatch(), range_fn, alignof(elem_t),
+      /*zero_init=*/false);
+  const std::size_t N_cells = left.range().volume() * left.nbatch();
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    auto& dst = result.data()[ord];
+    if (dst.empty()) continue;
+    TA_ASSERT(left.data()[ord].size() == right.data()[ord].size());
+    TA_ASSERT(left.data()[ord].size() == dst.size());
+    fill_op(dst.data(), left.data()[ord].data(), right.data()[ord].data(),
+            dst.size());
+  }
+  return result;
+}
+
+/// Trivial mixed scalar/ToT outer-Hadamard kernel: `tot_outer` drives the
+/// result's outer and per-cell inner ranges; `scalar_outer` supplies one
+/// scalar per outer cell. `fill_op(dst, tot_data, scalar_value, n_elements)`.
+template <typename OuterTensor, typename ToTSide, typename ScalarSide,
+          typename FillOp>
+OuterTensor arena_trivial_scaled(const ToTSide& tot_outer,
+                                 const ScalarSide& scalar_outer,
+                                 FillOp&& fill_op) {
+  using elem_t = typename OuterTensor::value_type::value_type;
+  using inner_range_t = typename OuterTensor::value_type::range_type;
+  TA_ASSERT(tot_outer.range().volume() == scalar_outer.range().volume());
+  TA_ASSERT(tot_outer.nbatch() == scalar_outer.nbatch());
+  auto range_fn = [&tot_outer](std::size_t ord) -> inner_range_t {
+    const auto& t = tot_outer.data()[ord];
+    return t.empty() ? inner_range_t{} : t.range();
+  };
+  OuterTensor result = arena_outer_init<OuterTensor>(
+      tot_outer.range(), tot_outer.nbatch(), range_fn, alignof(elem_t),
+      /*zero_init=*/false);
+  const std::size_t N_cells = tot_outer.range().volume() * tot_outer.nbatch();
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    auto& dst = result.data()[ord];
+    if (dst.empty()) continue;
+    fill_op(dst.data(), tot_outer.data()[ord].data(), scalar_outer.data()[ord],
+            dst.size());
+  }
+  return result;
+}
+
+/// Grow `result` in place so every cell whose current inner cell is null but
+/// `more_range_fn(cell_ordinal)` yields a non-empty range becomes an
+/// allocated, zero-initialized cell. Data already accumulated in non-empty
+/// cells is preserved -- a fresh slab is built and the old cell data copied
+/// over. A no-op (no reallocation) when nothing grows, so the steady-state
+/// path stays cheap. Used by the SUMMA ToT contraction, which shapes a result
+/// tile from its first K-panel only and must extend it for later panels of a
+/// contracted-dimension-sparse ToT operand.
+template <typename OuterTensor, typename MoreRangeFn>
+void arena_tot_grow_inplace(OuterTensor& result, MoreRangeFn&& more_range_fn) {
+  using inner_t = typename OuterTensor::value_type;
+  using elem_t = typename inner_t::value_type;
+  using inner_range_t = typename inner_t::range_type;
+  const std::size_t N_cells = result.range().volume() * result.nbatch();
+  std::vector<inner_range_t> ranges;
+  ranges.reserve(N_cells);
+  bool grows = false;
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    const auto& rc = result.data()[ord];
+    if (!rc.empty()) {
+      ranges.emplace_back(rc.range());
+      continue;
+    }
+    inner_range_t r = more_range_fn(ord);
+    if (r.volume() != 0) grows = true;
+    ranges.emplace_back(std::move(r));
+  }
+  if (!grows) return;
+  OuterTensor grown = arena_outer_init<OuterTensor>(
+      result.range(), result.nbatch(),
+      [&ranges](std::size_t ord) -> inner_range_t { return ranges[ord]; });
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    const auto& src = result.data()[ord];
+    if (src.empty()) continue;
+    auto& dst = grown.data()[ord];
+    TA_ASSERT(!dst.empty() && dst.size() == src.size());
+    const elem_t* s = src.data();
+    elem_t* d = dst.data();
+    for (std::size_t i = 0; i < src.size(); ++i) d[i] = s[i];
+  }
+  result = std::move(grown);
+}
+
+/// Accumulate `arg` into `result` (`result += arg`), first growing `result`
+/// to the union of the two tiles' inner-cell sparsity. Either tile may be
+/// outer-empty. Used to combine two partial contraction results whose
+/// disjoint K-panel subsets induced different inner-cell sparsity.
+template <typename OuterTensor>
+void arena_tot_add_to(OuterTensor& result, const OuterTensor& arg) {
+  using inner_t = typename OuterTensor::value_type;
+  using elem_t = typename inner_t::value_type;
+  using inner_range_t = typename inner_t::range_type;
+  if (arg.empty()) return;
+  auto arg_range_fn = [&arg](std::size_t ord) -> inner_range_t {
+    const auto& a = arg.data()[ord];
+    return a.empty() ? inner_range_t{} : a.range();
+  };
+  if (result.empty()) {
+    result =
+        arena_outer_init<OuterTensor>(arg.range(), arg.nbatch(), arg_range_fn);
+  } else {
+    TA_ASSERT(result.range().volume() == arg.range().volume());
+    TA_ASSERT(result.nbatch() == arg.nbatch());
+    arena_tot_grow_inplace(result, arg_range_fn);
+  }
+  const std::size_t N_cells = arg.range().volume() * arg.nbatch();
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    const auto& src = arg.data()[ord];
+    if (src.empty()) continue;
+    auto& dst = result.data()[ord];
+    TA_ASSERT(!dst.empty() && dst.size() == src.size());
+    const elem_t* s = src.data();
+    elem_t* d = dst.data();
+    for (std::size_t i = 0; i < src.size(); ++i) d[i] += s[i];
+  }
+}
+
+/// Shallow-permute outer cells while preserving inner storage. The result
+/// shares the source's inner storage (arena slab or aliased element data);
+/// only the outer-cell array is rebuilt in permuted order.
+template <typename OuterTensor, typename SrcOuterTensor, typename Perm>
+OuterTensor arena_permute_shallow(const SrcOuterTensor& src, const Perm& perm) {
+  using inner_t = typename OuterTensor::value_type;
+  TA_ASSERT(perm);
+  TA_ASSERT(perm.size() == src.range().rank());
+  auto perm_range = perm * src.range();
+  const std::size_t N_cells = src.range().volume();
+  const std::size_t total_cells = N_cells * src.nbatch();
+  auto data = make_outer_data<OuterTensor>(
+      total_cells, std::make_shared<Arena>(), src.data_shared());
+  OuterTensor result(perm_range, src.nbatch(), std::move(data));
+  for (std::size_t s = 0; s < N_cells; ++s) {
+    auto src_idx = src.range().idx(s);
+    auto tgt_ord = perm_range.ordinal(perm * src_idx);
+    for (std::size_t b = 0; b < src.nbatch(); ++b) {
+      const std::size_t s_off = b * N_cells + s;
+      const std::size_t t_off = b * N_cells + tgt_ord;
+      const inner_t& src_inner = src.data()[s_off];
+      if constexpr (is_arena_tensor_v<inner_t>) {
+        // The view is 8 bytes; copy rebinds it to the same Cell. The source's
+        // arena is kept alive by the keep-alive captured in the deleter.
+        ::new (result.data() + t_off) inner_t(src_inner);
+      } else {
+        auto src_inner_data = const_cast<inner_t&>(src_inner).data_shared();
+        ::new (result.data() + t_off) inner_t(
+            src_inner.range(), src_inner.nbatch(), std::move(src_inner_data));
+      }
+    }
+  }
+  return result;
+}
+
+/// Permute the inner modes of every cell of a slab-backed ToT outer tile.
+///
+/// Produces a fresh slab-backed tile with the same outer layout as `src`,
+/// but with each inner cell's range and data permuted by `inner_perm`
+/// (`result_cell(inner_perm * i) == src_cell(i)`). This is the slab-level
+/// counterpart of a per-cell permute: the owning tile allocates one new
+/// slab and rewrites every cell, so no view inner cell is ever asked to
+/// value-return. `inner_perm` is a plain (non-bipartite) permutation whose
+/// rank matches the inner-cell rank.
+template <typename OuterTensor, typename SrcOuterTensor, typename Perm>
+OuterTensor arena_inner_permute(const SrcOuterTensor& src,
+                                const Perm& inner_perm) {
+  using inner_t = typename OuterTensor::value_type;
+  using elem_t = typename inner_t::value_type;
+  using inner_range_t = typename inner_t::range_type;
+  TA_ASSERT(inner_perm);
+  const std::size_t rank = inner_perm.size();
+
+  // result cell range = inner_perm applied to the src cell range; a null
+  // src cell maps to a default (null) range -> a null result cell.
+  auto range_fn = [&src, &inner_perm, rank](std::size_t ord) -> inner_range_t {
+    const auto& s = src.data()[ord];
+    if (s.empty()) return inner_range_t{};
+    TA_ASSERT(static_cast<std::size_t>(s.range().rank()) == rank);
+    const auto& se = s.range().extent();
+    std::vector<std::size_t> ext(rank);
+    for (std::size_t d = 0; d < rank; ++d)
+      ext[d] = static_cast<std::size_t>(se[d]);
+    return inner_range_t(inner_perm * ext);
+  };
+  // The permute writes every result element exactly once, so no zero-init.
+  OuterTensor result = arena_outer_init<OuterTensor>(src.range(), src.nbatch(),
+                                                     range_fn, alignof(elem_t),
+                                                     /*zero_init=*/false);
+
+  const std::size_t N_cells = src.range().volume() * src.nbatch();
+  // Per-cell scratch (rank is fixed across cells); reused, not reallocated.
+  std::vector<std::size_t> dstride(rank), w(rank), ctr(rank);
+  for (std::size_t ord = 0; ord < N_cells; ++ord) {
+    auto& dst = result.data()[ord];
+    if (dst.empty()) continue;
+    const auto& s = src.data()[ord];
+    const auto& se = s.range().extent();
+    const auto& de = dst.range().extent();
+    // row-major strides of the (permuted) destination cell
+    dstride[rank - 1] = 1;
+    for (std::size_t d = rank - 1; d > 0; --d)
+      dstride[d - 1] = dstride[d] * static_cast<std::size_t>(de[d]);
+    // w[d] = destination stride contributed by source dimension d, since
+    // source dim d maps to destination dim inner_perm[d].
+    for (std::size_t d = 0; d < rank; ++d)
+      w[d] = dstride[static_cast<std::size_t>(inner_perm[d])];
+    // walk the source cell in row-major order, scattering into the dst cell
+    ctr.assign(rank, 0);
+    const std::size_t vol = s.size();
+    const elem_t* sd = s.data();
+    elem_t* dd = dst.data();
+    for (std::size_t so = 0; so < vol; ++so) {
+      std::size_t dofs = 0;
+      for (std::size_t d = 0; d < rank; ++d) dofs += w[d] * ctr[d];
+      dd[dofs] = sd[so];
+      for (std::size_t d = rank; d-- > 0;) {
+        if (++ctr[d] < static_cast<std::size_t>(se[d])) break;
+        ctr[d] = 0;
+      }
+    }
+  }
+  return result;
+}
+
+}  // namespace detail
+}  // namespace TiledArray
+
+#endif  // TILEDARRAY_TENSOR_ARENA_KERNELS_H__INCLUDED
diff --git a/src/TiledArray/tensor/arena_tensor.h b/src/TiledArray/tensor/arena_tensor.h
new file mode 100644
index 0000000000..b4c3d4959e
--- /dev/null
+++ b/src/TiledArray/tensor/arena_tensor.h
@@ -0,0 +1,535 @@
+/// ToT inner-tile type: pimpl-style pinned tensor backed by an arena cell.
+///
+/// `ArenaTensor<T, Range>` is one pointer wide. Its referent is a `Cell`
+/// (range header + co-located element storage, aligned for both) that the
+/// outer tile's arena allocates and owns. The `ArenaTensor` itself is
+/// non-owning; copies/moves rebind the pointer. Lifetime is bounded by the
+/// outer tile that owns the arena slab.
+
+#ifndef TILEDARRAY_TENSOR_ARENA_TENSOR_H__INCLUDED
+#define TILEDARRAY_TENSOR_ARENA_TENSOR_H__INCLUDED
+
+#include "TiledArray/error.h"
+#include "TiledArray/math/blas.h"
+#include "TiledArray/math/gemm_helper.h"
+#include "TiledArray/tensor/type_traits.h"
+
+#include <btas/zb/range.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+namespace TiledArray {
+
+/// Alignment of in-arena element storage, in bytes. Sized to cover the
+/// widest common SIMD register (AVX-512 ZMM = 64 B) and a single x86_64
+/// cache line. Override at configure time by defining
+/// TILEDARRAY_INNER_SIMD_ALIGN to a larger power-of-two (e.g. 128 for
+/// two-cache-line floor / Apple-Silicon L1 line size).
+#ifndef TILEDARRAY_INNER_SIMD_ALIGN
+#define TILEDARRAY_INNER_SIMD_ALIGN 64
+#endif
+
+inline constexpr std::size_t kInnerSimdAlign = TILEDARRAY_INNER_SIMD_ALIGN;
+static_assert((kInnerSimdAlign & (kInnerSimdAlign - 1)) == 0,
+              "kInnerSimdAlign must be a power of two");
+
+template <typename T, typename Range_ = ::btas::zb::RangeNd<>>
+class ArenaTensor;
+
+// Forward decls of the free in-place CPOs (defined below). Needed so the
+// member compound operators and member in-place CPOs on `ArenaTensor` can
+// reference them.
+template <typename T, typename R, typename Scalar>
+void scale_to(ArenaTensor<T, R>& dst, Scalar factor);
+template <typename T, typename R>
+void add_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src);
+template <typename T, typename R>
+void subt_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src);
+template <typename T, typename R>
+void mult_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src);
+template <typename T, typename R, typename Scalar>
+void axpy_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src,
+             Scalar alpha);
+
+template <typename T, typename Range_>
+class ArenaTensor {
+ public:
+  using value_type = T;
+  using numeric_type = typename detail::numeric_type<T>::type;
+  using scalar_type = typename detail::scalar_type<T>::type;
+  using range_type = Range_;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using reference = T&;
+  using const_reference = const T&;
+  using size_type = std::size_t;
+
+  /// In-arena layout: range header, then padding, then element storage.
+  struct Cell {
+    range_type range;
+  };
+
+  /// Alignment of the element pointer past the cell header. Caller-owned
+  /// arena slots must honour this so SIMD loads/stores on `data()` are
+  /// aligned without an extra runtime check.
+  static constexpr size_type data_alignment() noexcept {
+    return alignof(T) > kInnerSimdAlign ? alignof(T) : kInnerSimdAlign;
+  }
+
+  /// Offset (in bytes) of the first element past the cell header.
+  static constexpr size_type data_offset() noexcept {
+    constexpr size_type a = data_alignment();
+    return (sizeof(Cell) + a - 1) & ~(a - 1);
+  }
+
+  /// Total bytes a cell holding `n` elements consumes in the arena.
+  static constexpr size_type cell_size(size_type n) noexcept {
+    return data_offset() + n * sizeof(T);
+  }
+
+  /// Required alignment of a cell allocation. At least `data_alignment()`
+  /// so that `cell_base + data_offset()` lands on a SIMD boundary, and at
+  /// least `alignof(Cell)` so the range header is well-aligned.
+  static constexpr size_type cell_alignment() noexcept {
+    constexpr size_type da = data_alignment();
+    return alignof(Cell) > da ? alignof(Cell) : da;
+  }
+
+  ArenaTensor() = default;
+  ArenaTensor(const ArenaTensor&) = default;
+  /// Move construction transfers the view and leaves the source null.
+  ArenaTensor(ArenaTensor&& other) noexcept : cell_(other.cell_) {
+    other.cell_ = nullptr;
+  }
+  ~ArenaTensor() = default;
+
+  /// Unified assignment, with two regimes keyed on whether `*this` is bound:
+  ///  - bound (non-null) assignee: deep element-wise copy from `src` -- the
+  ///    view's storage already exists, so assignment writes into it;
+  ///  - null assignee: a shallow rebind of the view to `src`'s cell -- there
+  ///    is no storage to deep-copy into.
+  /// This must be a user-provided non-template operator: the implicit
+  /// copy-assignment (a shallow pointer copy) would otherwise be generated
+  /// and, as a non-template exact match, would always shadow the templated
+  /// `operator=` below for `ArenaTensor` sources. There is deliberately no
+  /// move-assignment -- an rvalue `ArenaTensor` binds here and follows the
+  /// same two regimes (moving a view buys nothing over copying it).
+  ArenaTensor& operator=(const ArenaTensor& src) {
+    if (cell_ == nullptr) {
+      cell_ = src.cell_;  // null assignee: rebind the view (shallow)
+      return *this;
+    }
+    return assign_elements_(src);  // bound assignee: deep copy
+  }
+
+  /// Construct a view onto a `Cell` (placement-newed by the arena factory).
+  explicit ArenaTensor(Cell* cell) noexcept : cell_(cell) {}
+
+  /// True if the view points at a non-null cell.
+  explicit operator bool() const noexcept { return cell_ != nullptr; }
+
+  /// True if the view is null (no cell).
+  bool empty() const noexcept { return cell_ == nullptr; }
+
+  /// Range of the referenced cell. UB if null.
+  const range_type& range() const noexcept {
+    TA_ASSERT(cell_ != nullptr);
+    return cell_->range;
+  }
+
+  /// Pointer to the first element. Null when the view is null.
+  pointer data() noexcept {
+    if (cell_ == nullptr) return nullptr;
+    auto* base = reinterpret_cast<std::byte*>(cell_);
+    return std::launder(reinterpret_cast<pointer>(base + data_offset()));
+  }
+
+  const_pointer data() const noexcept {
+    if (cell_ == nullptr) return nullptr;
+    auto* base = reinterpret_cast<const std::byte*>(cell_);
+    return std::launder(reinterpret_cast<const_pointer>(base + data_offset()));
+  }
+
+  /// Element count of the referenced cell, or 0 if null.
+  size_type size() const noexcept {
+    return cell_ != nullptr ? cell_->range.volume() : 0;
+  }
+
+  reference operator[](size_type i) noexcept {
+    TA_ASSERT(cell_ != nullptr);
+    return data()[i];
+  }
+  const_reference operator[](size_type i) const noexcept {
+    TA_ASSERT(cell_ != nullptr);
+    return data()[i];
+  }
+
+  /// Sum of all elements; `value_type{}` for a null view. A scalar
+  /// reduction allocates nothing, so it is valid on a view (unlike the
+  /// value-returning tensor ops, which are deliberately absent).
+  value_type sum() const noexcept {
+    value_type acc{};
+    if (cell_ == nullptr) return acc;
+    const auto* s = data();
+    for (size_type i = 0; i < size(); ++i) acc += s[i];
+    return acc;
+  }
+
+  /// Element-wise deep copy from a non-`ArenaTensor` tensor `src`. Valid only
+  /// for a bound (non-null) assignee: a null view has no storage to copy into
+  /// and a non-view `src` has no cell to rebind to (use the `ArenaTensor`
+  /// overload above for the rebind regime).
+  template <typename Src,
+            typename = std::enable_if_t<detail::is_tensor_v<Src> &&
+                                        !std::is_same_v<Src, ArenaTensor>>>
+  ArenaTensor& operator=(const Src& src) {
+    TA_ASSERT(cell_ != nullptr &&
+              "cannot assign a non-ArenaTensor source to a null ArenaTensor");
+    return assign_elements_(src);
+  }
+
+  /// In-place compound operators -- ArenaTensor is a view (no allocation),
+  /// so it provides only the *mutating* counterparts to the value-returning
+  /// `+`, `-`, `*` operators. Each delegates to the same-named free CPO
+  /// (forward-declared above, defined later in this header). The pair
+  /// (ArenaTensor x ArenaTensor) is the only one needed by TA's kernel
+  /// paths. Calls are fully-qualified to avoid recursing into the member
+  /// overloads of the same names below.
+  ArenaTensor& operator+=(const ArenaTensor& other) {
+    ::TiledArray::add_to(*this, other);
+    return *this;
+  }
+  ArenaTensor& operator-=(const ArenaTensor& other) {
+    ::TiledArray::subt_to(*this, other);
+    return *this;
+  }
+  ArenaTensor& operator*=(const ArenaTensor& other) {
+    ::TiledArray::mult_to(*this, other);
+    return *this;
+  }
+  // Scalar `*=` is intentionally not a member: the free `operator*=(T&&, N)`
+  // in operators_body.ipp already covers `view *= scalar`. A member template
+  // alongside it ties under gcc-13's overload resolution (ambiguous), so the
+  // free operator is the single provider of arena-cell `*= scalar`.
+
+  /// Member-call mirrors of the free in-place CPOs. Tile-interface paths
+  /// (`add_to(result, arg)`, `subt_to`, etc.) and `Tensor`'s legacy
+  /// `inplace_binary` use these. Bodies fully-qualify the free CPO call so
+  /// the member doesn't recurse into itself.
+  ArenaTensor& add_to(const ArenaTensor& other) {
+    ::TiledArray::add_to(*this, other);
+    return *this;
+  }
+  ArenaTensor& subt_to(const ArenaTensor& other) {
+    ::TiledArray::subt_to(*this, other);
+    return *this;
+  }
+  ArenaTensor& mult_to(const ArenaTensor& other) {
+    ::TiledArray::mult_to(*this, other);
+    return *this;
+  }
+  template <typename Scalar>
+    requires(detail::is_numeric_v<Scalar>)
+  ArenaTensor& scale_to(const Scalar factor) {
+    ::TiledArray::scale_to(*this, factor);
+    return *this;
+  }
+  ArenaTensor& neg_to() {
+    ::TiledArray::scale_to(*this, -T(1));
+    return *this;
+  }
+
+  /// axpy: <tt>*this += other * factor</tt> (axpy semantics; factor scales
+  /// only the added operand). Delegates to the free `axpy` CPO that the
+  /// outer-cell loop ultimately calls. Distinct from
+  /// `add_to(other, factor)` which would be the legacy
+  /// `(*this + other) * factor` semantics -- view tile types don't have
+  /// `operator+=` returning a value, so we keep the names separated.
+  template <typename Scalar>
+    requires(detail::is_numeric_v<Scalar>)
+  ArenaTensor& axpy_to(const ArenaTensor& other, const Scalar factor) {
+    ::TiledArray::axpy_to(*this, other, factor);
+    return *this;
+  }
+
+  /// axpy + fused permutation. ArenaTensor is a fixed-layout view, so any
+  /// non-empty permutation is rejected at runtime.
+  template <typename Scalar, typename Perm>
+    requires(detail::is_numeric_v<Scalar> && detail::is_permutation_v<Perm>)
+  ArenaTensor& axpy_to(const ArenaTensor& other, const Scalar factor,
+                       const Perm& perm) {
+    TA_EXCEPTION(
+        "ArenaTensor::axpy_to(other, factor, perm): inner permutation is not "
+        "supported for view cells");
+    return *this;
+  }
+
+  /// Internal accessor for the cell pointer. Used by the arena factory and
+  /// by destruction walks; not part of the user-facing surface.
+  Cell* cell() const noexcept { return cell_; }
+
+ private:
+  /// Deep element-wise copy into this bound view's storage from any tensor
+  /// `src` of matching volume (an `ArenaTensor` or an owning tensor alike).
+  template <typename Src>
+  ArenaTensor& assign_elements_(const Src& src) {
+    TA_ASSERT(cell_ != nullptr);
+    TA_ASSERT(size() == static_cast<size_type>(src.size()));
+    auto* dst = data();
+    const auto* src_data = src.data();
+    for (size_type i = 0; i < size(); ++i) dst[i] = src_data[i];
+    return *this;
+  }
+
+  Cell* cell_ = nullptr;
+};
+
+namespace detail {
+
+/// Placement-construct an `ArenaTensor<T, R>` at the given pre-aligned,
+/// pre-sized buffer. `buffer` must be at least
+/// `ArenaTensor<T,R>::cell_size(range.volume())` bytes and aligned to
+/// `ArenaTensor<T,R>::cell_alignment()`. Element storage is
+/// value-initialized (zero for arithmetic `T`).
+template <typename T, typename R>
+ArenaTensor<T, R> make_arena_tensor_in(std::byte* buffer, R range) {
+  using Inner = ArenaTensor<T, R>;
+  using Cell = typename Inner::Cell;
+  TA_ASSERT(buffer != nullptr);
+  TA_ASSERT(
+      reinterpret_cast<std::uintptr_t>(buffer) % Inner::cell_alignment() == 0);
+  const std::size_t n = range.volume();
+  Cell* cell = ::new (static_cast<void*>(buffer)) Cell{std::move(range)};
+  T* elems = reinterpret_cast<T*>(buffer + Inner::data_offset());
+  if constexpr (std::is_trivially_constructible_v<T>) {
+    std::memset(elems, 0, n * sizeof(T));
+  } else {
+    for (std::size_t i = 0; i < n; ++i)
+      ::new (static_cast<void*>(elems + i)) T();
+  }
+  return Inner(cell);
+}
+
+/// Destruct in-place. Mirrors `make_arena_tensor_in`'s construction. Safe
+/// on a null view (no-op). After this call the cell memory is uninitialized;
+/// the arena slab still owns the bytes.
+template <typename T, typename R>
+void destruct_arena_tensor(ArenaTensor<T, R>& inner) noexcept {
+  auto* cell = inner.cell();
+  if (cell == nullptr) return;
+  const std::size_t n = cell->range.volume();
+  if constexpr (!std::is_trivially_destructible_v<T>) {
+    T* elems = inner.data();
+    for (std::size_t i = 0; i < n; ++i) elems[i].~T();
+  }
+  if constexpr (!std::is_trivially_destructible_v<R>) {
+    cell->~Cell();
+  }
+}
+
+}  // namespace detail
+
+/// `is_tensor_view<T>` is forward-declared in `tensor/type_traits.h` (primary
+/// = `std::false_type`). Specializations for the concrete view types live
+/// below; `external/btas.h` adds a spec for `btas::TensorView`. Distinct
+/// from `is_tensor_helper`, which is also true for views (they are tensors
+/// structurally) -- `is_tensor_view` is the *secondary* gate that opts views
+/// out of value-returning member-call paths.
+
+/// True iff `T` is some `ArenaTensor<U, R>` -- the arena-pinned view type.
+/// Implies `is_tensor_view_v<T>`. Use this trait only where arena slab
+/// machinery is actually managed (e.g. clone, serialize, value-returning
+/// add/subt/mult that allocate via `arena_trivial_*_pinned`); for the
+/// "no value-returning ops on a view" gating use `is_tensor_view_v` instead.
+template <typename T>
+struct is_arena_tensor : std::false_type {};
+template <typename T, typename R>
+struct is_arena_tensor<ArenaTensor<T, R>> : std::true_type {};
+template <typename T>
+inline constexpr bool is_arena_tensor_v = is_arena_tensor<T>::value;
+
+// Every ArenaTensor is a view.
+template <typename T, typename R>
+struct is_tensor_view<ArenaTensor<T, R>> : std::true_type {};
+
+namespace detail {
+
+/// Register `ArenaTensor` as a tensor: it has the same `.data()` / `.size()`
+/// flat-contiguous-storage shape as `TA::Tensor`. This makes
+/// `is_tensor<ArenaTensor>` true and `is_tensor_of_tensor<Tensor<ArenaTensor>>`
+/// true via the existing recursion, so kernel-level dispatches
+/// (`tensor_reduce`, `inplace_tensor_op`, `tensor_op`, ...) match the same
+/// overloads they do for `TA::Tensor<double>` without bespoke arena
+/// overloads. To keep ArenaTensor out of value-returning member-call paths
+/// (which require allocation that views can't do), `ta_ops_match_tensor` is
+/// specialized below to false for `ArenaTensor`.
+template <typename T, typename R>
+struct is_tensor_helper<ArenaTensor<T, R>> : public std::true_type {};
+
+/// ArenaTensor's element storage is contiguous and row-major.
+template <typename T, typename R>
+struct is_contiguous_tensor_helper<ArenaTensor<T, R>> : public std::true_type {
+};
+
+/// `ArenaTensor` counts as one nesting level, so `Tensor<ArenaTensor<T>>`
+/// out-ranks a plain `Tensor<T>`. Without this, `nested_rank<ArenaTensor>`
+/// falls through to the primary `= 0` and `einsum`'s `MaxNestedArray` ties a
+/// ToT arena array with a plain array, picking the wrong result tile type.
+template <typename T, typename R>
+constexpr size_t nested_rank<ArenaTensor<T, R>> = 1 + nested_rank<T>;
+
+template <typename T, typename R>
+constexpr size_t nested_rank<const ArenaTensor<T, R>> =
+    nested_rank<ArenaTensor<T, R>>;
+
+}  // namespace detail
+
+// Note: `detail::TensorInterface` (a.k.a. `TA::TensorMap`) is non-owning,
+// but it *does* provide value-returning member arithmetic (`.add()`,
+// `.subt()`, ...) that materializes a fresh tensor. So it does NOT
+// participate in `is_tensor_view` -- this trait is reserved for views that
+// lack value-returning member arith (cannot allocate on their own), like
+// `ArenaTensor` and `btas::TensorView`.
+
+}  // namespace TiledArray
+
+// btas::TensorView is btas's existing non-owning view type. Register it as
+// a view too. Forward-declared here (signature mirrors btas/tensorview.h)
+// to avoid pulling that header into arena_tensor.h.
+namespace btas {
+template <typename _T, class _Range, class _Storage, class _Policy>
+class TensorView;
+}  // namespace btas
+
+namespace TiledArray {
+template <typename T, class R, class S, class P>
+struct is_tensor_view<::btas::TensorView<T, R, S, P>> : std::true_type {};
+
+/// Zero all elements of `dst`. No-op on a null view.
+template <typename T, typename R>
+void zero(ArenaTensor<T, R>& dst) noexcept {
+  if (!dst) return;
+  std::memset(dst.data(), 0, dst.size() * sizeof(T));
+}
+
+/// Fill `dst` with `value`. No-op on a null view.
+template <typename T, typename R, typename U>
+void fill(ArenaTensor<T, R>& dst, const U& value) {
+  if (!dst) return;
+  std::fill_n(dst.data(), dst.size(), static_cast<T>(value));
+}
+
+/// `dst *= factor`. No-op on a null view.
+template <typename T, typename R, typename Scalar>
+void scale_to(ArenaTensor<T, R>& dst, Scalar factor) {
+  if (!dst) return;
+  auto* d = dst.data();
+  const auto n = dst.size();
+  for (std::size_t i = 0; i < n; ++i) d[i] *= factor;
+}
+
+/// `dst += src`. Asserts both views non-null and shape-compatible.
+template <typename T, typename R>
+void add_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src) {
+  if (!dst || !src) return;
+  TA_ASSERT(dst.size() == src.size());
+  auto* d = dst.data();
+  const auto* s = src.data();
+  for (std::size_t i = 0; i < dst.size(); ++i) d[i] += s[i];
+}
+
+/// `dst -= src`. Asserts both views non-null and shape-compatible.
+template <typename T, typename R>
+void subt_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src) {
+  if (!dst || !src) return;
+  TA_ASSERT(dst.size() == src.size());
+  auto* d = dst.data();
+  const auto* s = src.data();
+  for (std::size_t i = 0; i < dst.size(); ++i) d[i] -= s[i];
+}
+
+/// `dst *= src` element-wise. Asserts both views non-null and shape-compatible.
+template <typename T, typename R>
+void mult_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src) {
+  if (!dst || !src) return;
+  TA_ASSERT(dst.size() == src.size());
+  auto* d = dst.data();
+  const auto* s = src.data();
+  for (std::size_t i = 0; i < dst.size(); ++i) d[i] *= s[i];
+}
+
+/// `dst += src * alpha` (in-place BLAS-like AXPY). Asserts both views
+/// non-null and shape-compatible. Argument order matches TA's `_to` CPO
+/// convention `(result, arg, factor)`; the BLAS name AXPY captures the
+/// semantics (in-place, not value-producing).
+template <typename T, typename R, typename Scalar>
+void axpy_to(ArenaTensor<T, R>& dst, const ArenaTensor<T, R>& src,
+             Scalar alpha) {
+  if (!dst || !src) return;
+  TA_ASSERT(dst.size() == src.size());
+  auto* d = dst.data();
+  const auto* s = src.data();
+  for (std::size_t i = 0; i < dst.size(); ++i) d[i] += alpha * s[i];
+}
+
+/// Sum of squared elements; 0 for null views.
+template <typename T, typename R>
+auto squared_norm(const ArenaTensor<T, R>& src) noexcept {
+  T acc{};
+  if (!src) return acc;
+  const auto* s = src.data();
+  for (std::size_t i = 0; i < src.size(); ++i) acc += s[i] * s[i];
+  return acc;
+}
+
+/// Copy `src` into a freshly-allocated `Standalone`. Returns a default-
+/// constructed (null) `Standalone` when `src` is null.
+template <typename Standalone, typename T, typename R>
+Standalone materialize(const ArenaTensor<T, R>& src) {
+  if (!src) return Standalone();
+  Standalone out(src.range());
+  std::copy_n(src.data(), src.size(), out.data());
+  return out;
+}
+
+/// GEMM CPO for `ArenaTensor`: accumulates `result += factor * left * right`
+/// via BLAS. The result must be pre-allocated (e.g. zero-initialized by
+/// `arena_outer_init`) -- this overload never resizes. More specific
+/// than `tile_op/tile_interface.h`'s generic `gemm` template (which would
+/// otherwise fall through to a nonexistent `result.gemm(...)` member),
+/// so partial ordering picks it for `ArenaTensor` arguments.
+template <typename T, typename R, typename Scalar>
+auto gemm(ArenaTensor<T, R>& result, const ArenaTensor<T, R>& left,
+          const ArenaTensor<T, R>& right, Scalar factor,
+          const math::GemmHelper& gemm_helper)
+    -> std::enable_if_t<detail::is_numeric_v<Scalar>, ArenaTensor<T, R>&> {
+  if (!left || !right) return result;
+  TA_ASSERT(bool(result));
+  TA_ASSERT(left.range().rank() == gemm_helper.left_rank());
+  TA_ASSERT(right.range().rank() == gemm_helper.right_rank());
+
+  using integer = math::blas::integer;
+  integer M, N, K;
+  gemm_helper.compute_matrix_sizes(M, N, K, left.range(), right.range());
+
+  const integer lda =
+      (gemm_helper.left_op() == math::blas::NoTranspose) ? K : M;
+  const integer ldb =
+      (gemm_helper.right_op() == math::blas::NoTranspose) ? N : K;
+  const integer ldc = N;
+
+  math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), M, N, K,
+                   static_cast<T>(factor), left.data(), lda, right.data(), ldb,
+                   T(1), result.data(), ldc);
+  return result;
+}
+
+}  // namespace TiledArray
+
+#endif  // TILEDARRAY_TENSOR_ARENA_TENSOR_H__INCLUDED
diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h
index c68bad8f7a..3fdf86df75 100644
--- a/src/TiledArray/tensor/kernels.h
+++ b/src/TiledArray/tensor/kernels.h
@@ -28,6 +28,7 @@
 
 #include <TiledArray/einsum/index.h>
 #include <TiledArray/math/gemm_helper.h>
+#include <TiledArray/tensor/arena_tensor.h>
 #include <TiledArray/tensor/permute.h>
 #include <TiledArray/tensor/utility.h>
 #include <TiledArray/util/vector.h>
@@ -1266,6 +1267,53 @@ auto tensor_contract(TensorA const& A, TensorB const& B,
   return plan.do_perm.C ? permute(result, plan.perm.C.inv()) : result;
 }
 
+/// In-place contraction. Accumulates `factor * (A contracted with B per
+/// plan)` into `result` with beta=1 -- `result` must be pre-allocated and
+/// zero-initialized (or carry an existing partial sum to add into).
+///
+/// Fast path: when `plan.do_perm.{A,B,C}` are all false (the canonical
+/// alignment the expression engine produces), the contraction is exactly
+/// one GEMM into `result` via the free `gemm` CPO. Works uniformly for
+/// `TA::Tensor` and `ArenaTensor` inner cells.
+///
+/// Slow path: when any operand requires permutation, the value-returning
+/// `tensor_contract` is called and its result is accumulated into `result`
+/// via free `add_to`. This requires materialization, which is incompatible
+/// with `ArenaTensor`'s pinned-storage contract; for arena cells the
+/// non-canonical case throws (the expression engine should pre-align).
+template <typename ResultTensor, typename TensorA, typename TensorB,
+          typename Annot, typename Scalar,
+          typename = std::enable_if_t<is_annotation_v<Annot>>>
+ResultTensor& tensor_contract_to(ResultTensor& result, TensorA const& A,
+                                 TensorB const& B, Scalar factor,
+                                 const TensorContractionPlan<Annot>& plan) {
+  if (!plan.do_perm.A && !plan.do_perm.B && !plan.do_perm.C) {
+    return gemm(result, A, B, factor, plan.gemm_helper);
+  }
+  constexpr bool any_arena = ::TiledArray::is_arena_tensor_v<ResultTensor> ||
+                             ::TiledArray::is_arena_tensor_v<TensorA> ||
+                             ::TiledArray::is_arena_tensor_v<TensorB>;
+  if constexpr (any_arena) {
+    TA_EXCEPTION(
+        "tensor_contract_to: non-canonical plan (do_perm.{A,B,C} not all "
+        "false) is unsupported for ArenaTensor cells; the expression "
+        "engine should pre-align inner modes to the canonical layout.");
+    return result;
+  } else {
+    // Value-semantic slow path. tensor_contract internally uses alpha=1;
+    // restrict callers here to factor=1 so the math matches. Regime-A
+    // always passes factor=1; lift this restriction only if a real caller
+    // needs a non-unit scale on the non-canonical path.
+    using Numeric = typename ResultTensor::numeric_type;
+    TA_ASSERT(static_cast<Numeric>(factor) == Numeric{1} &&
+              "tensor_contract_to: non-canonical plan currently supports "
+              "factor == 1 only");
+    auto prod = tensor_contract(A, B, plan);
+    if (!prod.empty()) add_to(result, prod);
+    return result;
+  }
+}
+
 /// contracts 2 tensors, with 1 plan construction per call.
 /// Thus this is inefficient; plan should be constructed separately and then
 /// used to for multiple calls (see the variant of this function that
diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h
index 05636c3d7d..24fd81c89f 100644
--- a/src/TiledArray/tensor/operators.h
+++ b/src/TiledArray/tensor/operators.h
@@ -50,7 +50,8 @@ namespace TiledArray {
 /// \return A tensor where element \c i is equal to <tt>tensor[i] + number</tt>
 template <typename T1,
           typename = std::enable_if_t<
-              TA::detail::is_nested_tensor_v<TA::detail::remove_cvr_t<T1>>>>
+              TA::detail::is_nested_tensor_v<TA::detail::remove_cvr_t<T1>> &&
+              !TA::is_tensor_view_v<TA::detail::remove_cvr_t<T1>>>>
 inline decltype(auto) operator+(
     T1&& tensor, TA::detail::numeric_t<TA::detail::remove_cvr_t<T1>> number) {
   return std::forward<T1>(tensor).add(number);
@@ -65,7 +66,8 @@ inline decltype(auto) operator+(
 /// \return A tensor where element \c i is equal to <tt>tensor[i] + number</tt>
 template <typename T1,
           typename = std::enable_if_t<
-              TA::detail::is_nested_tensor_v<TA::detail::remove_cvr_t<T1>>>>
+              TA::detail::is_nested_tensor_v<TA::detail::remove_cvr_t<T1>> &&
+              !TA::is_tensor_view_v<TA::detail::remove_cvr_t<T1>>>>
 inline decltype(auto) operator+(
     TA::detail::numeric_t<TA::detail::remove_cvr_t<T1>> number, T1&& tensor) {
   return std::forward<T1>(tensor).add(number);
@@ -80,7 +82,8 @@ inline decltype(auto) operator+(
 /// \return A tensor where element \c i is equal to <tt>tensor[i] - number</tt>
 template <typename T1,
           typename = std::enable_if_t<
-              TA::detail::is_nested_tensor_v<TA::detail::remove_cvr_t<T1>>>>
+              TA::detail::is_nested_tensor_v<TA::detail::remove_cvr_t<T1>> &&
+              !TA::is_tensor_view_v<TA::detail::remove_cvr_t<T1>>>>
 inline decltype(auto) operator-(
     T1&& tensor, TA::detail::numeric_t<TA::detail::remove_cvr_t<T1>> number) {
   return std::forward<T1>(tensor).subt(number);
diff --git a/src/TiledArray/tensor/operators_body.ipp b/src/TiledArray/tensor/operators_body.ipp
index 4e2d736a84..6b625b82b2 100644
--- a/src/TiledArray/tensor/operators_body.ipp
+++ b/src/TiledArray/tensor/operators_body.ipp
@@ -77,11 +77,15 @@ inline decltype(auto) operator*(N number, T&& tensor) {
   return scale(std::forward<T>(tensor), number);
 }
 
-/// tensor += tensor
+/// tensor += tensor -- compound assignment is valid for any tensor whose
+/// storage can be mutated, including views. Gated on the broader
+/// `ta_ops_match_tensor_inplace_v` predicate.
 template <typename T1, typename T2,
           typename = std::enable_if_t<
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T1>> &&
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T2>>>>
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T1>> &&
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T2>>>>
 inline decltype(auto) operator+=(T1&& left, const T2& right) {
   return add_to(std::forward<T1>(left), right);
 }
@@ -89,8 +93,10 @@ inline decltype(auto) operator+=(T1&& left, const T2& right) {
 /// tensor -= tensor
 template <typename T1, typename T2,
           typename = std::enable_if_t<
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T1>> &&
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T2>>>>
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T1>> &&
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T2>>>>
 inline decltype(auto) operator-=(T1&& left, const T2& right) {
   return subt_to(std::forward<T1>(left), right);
 }
@@ -98,8 +104,10 @@ inline decltype(auto) operator-=(T1&& left, const T2& right) {
 /// tensor *= tensor (element-wise)
 template <typename T1, typename T2,
           typename = std::enable_if_t<
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T1>> &&
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T2>>>>
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T1>> &&
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T2>>>>
 inline decltype(auto) operator*=(T1&& left, const T2& right) {
   return mult_to(std::forward<T1>(left), right);
 }
@@ -107,7 +115,8 @@ inline decltype(auto) operator*=(T1&& left, const T2& right) {
 /// tensor *= scalar
 template <typename T, typename N,
           typename = std::enable_if_t<
-              detail::ta_ops_match_tensor_v<TA::detail::remove_cvr_t<T>> &&
+              detail::ta_ops_match_tensor_inplace_v<
+                  TA::detail::remove_cvr_t<T>> &&
               TA::detail::is_numeric_v<N>>>
 inline decltype(auto) operator*=(T&& left, N right) {
   return scale_to(std::forward<T>(left), right);
diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h
index fa04ff7eda..ca67641e83 100644
--- a/src/TiledArray/tensor/tensor.h
+++ b/src/TiledArray/tensor/tensor.h
@@ -27,6 +27,7 @@
 
 #include "TiledArray/math/blas.h"
 #include "TiledArray/math/gemm_helper.h"
+#include "TiledArray/tensor/arena_kernels.h"
 #include "TiledArray/tensor/complex.h"
 #include "TiledArray/tensor/kernels.h"
 #include "TiledArray/tile_interface/clone.h"
@@ -266,7 +267,8 @@ class Tensor {
   template <typename T_>
   static decltype(auto) value_converter(const T_& arg) {
     using arg_type = detail::remove_cvr_t<decltype(arg)>;
-    if constexpr (detail::is_tensor_v<arg_type>)  // clone nested tensors
+    if constexpr (detail::is_tensor_v<arg_type> &&
+                  !is_tensor_view_v<arg_type>)  // clone owning nested tensors
       return arg.clone();
     else if constexpr (!std::is_same_v<arg_type, value_type>) {  // convert
       if constexpr (std::is_convertible_v<arg_type, value_type>)
@@ -274,7 +276,7 @@ class Tensor {
       else
         return conversions::to<value_type, arg_type>()(arg);
     } else
-      return arg;
+      return arg;  // identity (for views, copy = rebind, no deep clone)
   };
 
   range_type range_;  ///< Range
@@ -369,9 +371,14 @@ class Tensor {
       : Tensor(range, 1, default_construct{false}) {
     const auto n = this->size();
     pointer MADNESS_RESTRICT const data = this->data();
-    Clone<Value, Value> cloner;
-    for (size_type i = 0ul; i < n; ++i)
-      new (data + i) value_type(cloner(value));
+    if constexpr (is_tensor_view_v<Value>) {
+      // Views are rebind-on-copy and lack member `clone`; just copy each.
+      for (size_type i = 0ul; i < n; ++i) new (data + i) value_type(value);
+    } else {
+      Clone<Value, Value> cloner;
+      for (size_type i = 0ul; i < n; ++i)
+        new (data + i) value_type(cloner(value));
+    }
   }
 
   /// Construct a tensor of scalars, setting all elements to the same value
@@ -481,8 +488,13 @@ class Tensor {
     // we do that now
     constexpr bool is_tot = detail::is_tensor_of_tensor_v<Tensor>;
     constexpr bool is_bperm = detail::is_bipartite_permutation_v<Perm>;
-    // tile ops pass bipartite permutations here even if this is a plain tensor
-    if constexpr (is_tot && is_bperm) {
+    constexpr bool is_view = is_tensor_view_v<value_type>;
+    // tile ops pass bipartite permutations here even if this is a plain tensor.
+    // For view inners, the cell has fixed layout that can't be permuted in
+    // place -- skip the inner-permute pass and rely on callers to arrange
+    // canonical inner indexing (regime-A einsum's `do_perm.{A,B,C}` bailout
+    // guarantees no inner permutation is needed for our paths).
+    if constexpr (is_tot && is_bperm && !is_view) {
       if (inner_size(perm) != 0) {
         const auto inner_perm = inner(perm);
         Permute<value_type, value_type> p;
@@ -493,6 +505,12 @@ class Tensor {
           if (!el.empty()) el = p(el, inner_perm);
         }
       }
+    } else if constexpr (is_tot && is_bperm && is_view) {
+      if (inner_size(perm) != 0) {
+        TA_EXCEPTION(
+            "Tensor<View>: inner permutation requested but view "
+            "cells cannot be permuted in place");
+      }
     }
   }
 
@@ -652,8 +670,21 @@ class Tensor {
   Tensor clone() const& {
     Tensor result;
     if (data_) {
-      if constexpr (detail::is_tensor_of_tensor_v<Tensor>) {
-        result = Tensor(*this, [](value_type const& el) { return el.clone(); });
+      if constexpr (detail::is_tensor_of_tensor_v<Tensor> &&
+                    detail::is_ta_tensor_v<value_type>) {
+        auto fill = [](typename value_type::value_type* dst,
+                       const typename value_type::value_type* src,
+                       std::size_t n) {
+          for (std::size_t i = 0; i < n; ++i) dst[i] = src[i];
+        };
+        result = detail::arena_trivial_unary<Tensor>(*this, fill);
+      } else if constexpr (is_arena_tensor_v<value_type>) {
+        auto fill = [](typename value_type::value_type* dst,
+                       const typename value_type::value_type* src,
+                       std::size_t n) {
+          for (std::size_t i = 0; i < n; ++i) dst[i] = src[i];
+        };
+        result = detail::arena_trivial_unary<Tensor>(*this, fill);
       } else {
         result = detail::tensor_op<Tensor>(
             [](const numeric_type value) -> numeric_type { return value; },
@@ -1190,11 +1221,20 @@ class Tensor {
     if (!empty) {
       ar & range;
       ar & nbatch;
-      if constexpr (madness::is_input_archive_v<Archive>) {
-        *this = Tensor(std::move(range), nbatch, default_construct{true});
+      if constexpr (is_arena_tensor_v<value_type>) {
+        // ArenaTensor inner cells own no storage themselves; their data
+        // lives in a per-outer-tile arena slab. Bypass the generic
+        // wrap(value_type*, N) path (which would try to serialize bare
+        // Cell* pointers across processes) and manage cell storage at
+        // this outer-tile boundary instead. The slab is rebuilt on load.
+        serialize_arena_inner_cells(ar, std::move(range), nbatch);
+      } else {
+        if constexpr (madness::is_input_archive_v<Archive>) {
+          *this = Tensor(std::move(range), nbatch, default_construct{true});
+        }
+        ar& madness::archive::wrap(this->data_.get(),
+                                   this->range_.volume() * nbatch);
       }
-      ar& madness::archive::wrap(this->data_.get(),
-                                 this->range_.volume() * nbatch);
     } else {
       if constexpr (madness::is_input_archive_v<Archive>) {
         *this = Tensor{};
@@ -1202,6 +1242,60 @@ class Tensor {
     }
   }
 
+ private:
+  /// ArenaTensor-aware inner-cell serialization. Writes per-cell metadata
+  /// (null flag + range) then element bytes; on load, rebuilds the outer
+  /// via `arena_outer_init` so the slab is reconstructed in one
+  /// allocation and the outer-data deleter keeps it alive.
+  template <typename Archive>
+  void serialize_arena_inner_cells(Archive& ar, range_type range,
+                                   std::size_t nbatch) {
+    using InnerT = value_type;
+    using InnerRange = typename InnerT::range_type;
+    const std::size_t N = range.volume() * nbatch;
+    if constexpr (madness::is_output_archive_v<Archive>) {
+      // Per-cell null flags.
+      for (std::size_t i = 0; i < N; ++i) {
+        bool not_null = bool(this->data_.get()[i]);
+        ar & not_null;
+      }
+      // Inner ranges for non-null cells only.
+      for (std::size_t i = 0; i < N; ++i) {
+        const InnerT& cell = this->data_.get()[i];
+        if (cell) ar & cell.range();
+      }
+      // Element bytes for non-null cells only.
+      for (std::size_t i = 0; i < N; ++i) {
+        const InnerT& cell = this->data_.get()[i];
+        if (cell) ar& madness::archive::wrap(cell.data(), cell.size());
+      }
+    } else {
+      // Load: read all metadata, plan + allocate slab via the factory,
+      // then read element bytes into each placed cell's data().
+      std::vector<bool> flags(N);
+      for (std::size_t i = 0; i < N; ++i) {
+        bool f;
+        ar & f;
+        flags[i] = f;
+      }
+      std::vector<InnerRange> ranges(N);
+      for (std::size_t i = 0; i < N; ++i) {
+        if (flags[i]) ar& ranges[i];
+      }
+      *this = detail::arena_outer_init<Tensor>(
+          range, nbatch, [&](std::size_t ord) -> InnerRange {
+            return flags[ord] ? ranges[ord] : InnerRange{};
+          });
+      for (std::size_t i = 0; i < N; ++i) {
+        if (flags[i]) {
+          InnerT& cell = this->data_.get()[i];
+          ar& madness::archive::wrap(cell.data(), cell.size());
+        }
+      }
+    }
+  }
+
+ public:
   /// Swap tensor data
 
   /// \param other The tensor to swap with this
@@ -1441,7 +1535,27 @@ class Tensor {
   template <typename Perm,
             typename = std::enable_if_t<detail::is_permutation_v<Perm>>>
   Tensor permute(const Perm& perm) const {
-    return Tensor(*this, perm);
+    if constexpr (is_arena_tensor_v<value_type>) {
+      // View inner cells cannot be permuted in place; the owning tile
+      // rewrites its slab(s). The outer cells reorder shallowly (the 8-byte
+      // views are reindexed, the slab is shared via keep-alive); a
+      // non-trivial inner permutation rewrites every cell into a fresh slab.
+      // The generic Tensor(other, perm) ctor's allocate-then-fill shape does
+      // not fit the arena slab model, so route around it.
+      const auto outer_perm = outer(perm);
+      Tensor result =
+          (outer_perm && !outer_perm.is_identity())
+              ? detail::arena_permute_shallow<Tensor>(*this, outer_perm)
+              : *this;
+      if constexpr (detail::is_bipartite_permutation_v<Perm>) {
+        const auto inner_perm = inner(perm);
+        if (inner_perm && !inner_perm.is_identity())
+          result = detail::arena_inner_permute<Tensor>(result, inner_perm);
+      }
+      return result;
+    } else {
+      return Tensor(*this, perm);
+    }
   }
 
   /// Shift the lower and upper bound of this tensor
@@ -1680,10 +1794,27 @@ class Tensor {
     // early exit for empty this
     if (empty()) return {};
 
-    return unary([factor](const value_type& a) {
-      using namespace TiledArray::detail;
-      return a * factor;
-    });
+    if constexpr (detail::is_tensor_of_tensor_v<Tensor> &&
+                  detail::is_ta_tensor_v<value_type>) {
+      auto fill = [factor](typename value_type::value_type* dst,
+                           const typename value_type::value_type* src,
+                           std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = src[i] * factor;
+      };
+      return detail::arena_trivial_unary<Tensor>(*this, fill);
+    } else if constexpr (is_arena_tensor_v<value_type>) {
+      auto fill = [factor](typename value_type::value_type* dst,
+                           const typename value_type::value_type* src,
+                           std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = src[i] * factor;
+      };
+      return detail::arena_trivial_unary<Tensor>(*this, fill);
+    } else {
+      return unary([factor](const value_type& a) {
+        using namespace TiledArray::detail;
+        return a * factor;
+      });
+    }
   }
 
   /// Construct a scaled copy of this tensor
@@ -1714,12 +1845,19 @@ class Tensor {
     // early exit for empty this
     if (empty()) return {};
 
-    return unary(
-        [factor](const value_type& a) {
-          using namespace TiledArray::detail;
-          return a * factor;
-        },
-        perm);
+    if constexpr (is_tensor_view_v<value_type>) {
+      TA_EXCEPTION(
+          "Tensor<View>::scale(factor, perm): permutation is not "
+          "supported for view inner cells");
+      return Tensor{};
+    } else {
+      return unary(
+          [factor](const value_type& a) {
+            using namespace TiledArray::detail;
+            return a * factor;
+          },
+          perm);
+    }
   }
 
   /// Scale this tensor
@@ -1739,6 +1877,111 @@ class Tensor {
 
   // Addition operations
 
+  /// Element-wise add for `Tensor<ArenaTensor>` ToT operands. Routes through
+  /// the arena binary kernel; inner cells have no `operator+` of their own.
+  template <typename Right>
+    requires(is_arena_tensor_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type>)
+  Tensor add(const Right& right) const {
+    if (empty()) return detail::clone_or_cast<Tensor>(right);
+    if (right.empty()) return this->clone();
+    auto fill = [](typename value_type::value_type* dst,
+                   const typename value_type::value_type* l,
+                   const typename value_type::value_type* r, std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] + r[i];
+    };
+    return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+  }
+
+  /// Mixed `Tensor<ArenaTensor> + Tensor<scalar>`: each inner element is
+  /// offset by the corresponding outer-cell scalar. Routes through the
+  /// arena scaled kernel; no operator+ between ArenaTensor and scalar.
+  template <typename Right>
+    requires(is_arena_tensor_v<value_type> &&
+             detail::is_numeric_v<typename Right::value_type>)
+  Tensor add(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    using ElemT = typename value_type::value_type;
+    using Scalar = typename Right::value_type;
+    auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s,
+                   std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = arena[i] + s;
+    };
+    return detail::arena_trivial_scaled<Tensor>(*this, right, fill);
+  }
+
+  /// Mixed `Tensor<scalar> + Tensor<ArenaTensor>`: symmetric to above,
+  /// result has the same ToT layout as the right operand.
+  template <typename Right>
+    requires(detail::is_numeric_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type>)
+  Right add(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    using ArenaInner = typename Right::value_type;
+    using ElemT = typename ArenaInner::value_type;
+    using Scalar = value_type;
+    auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s,
+                   std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = s + arena[i];
+    };
+    return detail::arena_trivial_scaled<Right>(right, *this, fill);
+  }
+
+  /// Scaled element-wise add for `Tensor<ArenaTensor>` ToT operands:
+  /// `(this + right) * factor`. Routes through the arena binary kernel.
+  template <typename Right, typename Scalar>
+    requires(is_arena_tensor_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type> &&
+             detail::is_numeric_v<Scalar>)
+  Tensor add(const Right& right, const Scalar factor) const {
+    using ElemT = typename value_type::value_type;
+    auto fill = [factor](ElemT* dst, const ElemT* l, const ElemT* r,
+                         std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = (l[i] + r[i]) * factor;
+    };
+    return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+  }
+
+  /// True if \p perm reorders nothing -- empty or identity. Handles a plain
+  /// Permutation and a (bipartite) ToT permutation alike.
+  template <typename Perm>
+  static bool arena_perm_is_trivial(const Perm& perm) {
+    if constexpr (std::is_same_v<Perm, BipartitePermutation>)
+      return !static_cast<bool>(perm) ||
+             (perm.first().is_identity() && perm.second().is_identity());
+    else
+      return !static_cast<bool>(perm) || perm.is_identity();
+  }
+
+  /// Permuted add for `Tensor<ArenaTensor>` ToT operands. A non-trivial
+  /// permutation of arena ToT tiles is not yet supported; an identity (or
+  /// null) permutation falls through to the plain element-wise add.
+  template <typename Right, typename Perm>
+    requires(is_arena_tensor_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type> &&
+             detail::is_permutation_v<Perm>)
+  Tensor add(const Right& right, const Perm& perm) const {
+    if (!arena_perm_is_trivial(perm))
+      TA_EXCEPTION(
+          "TA::Tensor<ArenaTensor>::add: permuted add of a tensor-of-tensors "
+          "is not yet supported");
+    return add(right);
+  }
+
+  /// Permuted scaled add for `Tensor<ArenaTensor>` ToT operands; see the
+  /// permuted-add overload above for the permutation restriction.
+  template <typename Right, typename Scalar, typename Perm>
+    requires(is_arena_tensor_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type> &&
+             detail::is_numeric_v<Scalar> && detail::is_permutation_v<Perm>)
+  Tensor add(const Right& right, const Scalar factor, const Perm& perm) const {
+    if (!arena_perm_is_trivial(perm))
+      TA_EXCEPTION(
+          "TA::Tensor<ArenaTensor>::add: permuted scaled add of a "
+          "tensor-of-tensors is not yet supported");
+    return add(right, factor);
+  }
+
   /// Add this and \c other to construct a new tensor
 
   /// \tparam Right The right-hand tensor type
@@ -1748,7 +1991,11 @@ class Tensor {
   template <typename Right>
     requires(is_tensor<Right>::value &&
              detail::sum_convertible_to<value_type, const value_type&,
-                                        const value_t<Right>&>)
+                                        const value_t<Right>&> &&
+             !(is_arena_tensor_v<value_type> &&
+               detail::is_numeric_v<typename Right::value_type>) &&
+             !(detail::is_numeric_v<value_type> &&
+               is_arena_tensor_v<typename Right::value_type>))
   Tensor add(const Right& right) const {
     // early exit for empty right
     if (right.empty()) return this->clone();
@@ -1756,24 +2003,35 @@ class Tensor {
     // early exit for empty this
     if (empty()) detail::clone_or_cast<Tensor>(right);
 
-    return binary(
-        right,
-        [](const value_type& l, const value_t<Right>& r) -> decltype(l + r) {
-          if constexpr (detail::is_tensor_v<value_type>) {
-            if (l.empty()) {
-              if (r.empty())
-                return {};
-              else
-                return r.clone();
-            } else {
-              if (r.empty())
-                return l.clone();
-              else
-                return l + r;
+    if constexpr (detail::is_tensor_of_tensor_v<Tensor> &&
+                  detail::is_ta_tensor_v<value_type> &&
+                  detail::is_ta_tensor_v<typename Right::value_type>) {
+      auto fill = [](typename value_type::value_type* dst,
+                     const typename value_type::value_type* l,
+                     const typename value_type::value_type* r, std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] + r[i];
+      };
+      return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+    } else {
+      return binary(
+          right,
+          [](const value_type& l, const value_t<Right>& r) -> decltype(l + r) {
+            if constexpr (detail::is_tensor_v<value_type>) {
+              if (l.empty()) {
+                if (r.empty())
+                  return {};
+                else
+                  return r.clone();
+              } else {
+                if (r.empty())
+                  return l.clone();
+                else
+                  return l + r;
+              }
             }
-          }
-          return l + r;
-        });
+            return l + r;
+          });
+    }
   }
 
   /// Add this and \c other to construct a new tensor
@@ -1800,7 +2058,13 @@ class Tensor {
   template <typename Right>
     requires(detail::is_tensor_v<Right> &&
              !detail::sum_convertible_to<value_type, const value_type&,
-                                         const value_t<Right>&>)
+                                         const value_t<Right>&> &&
+             !(is_arena_tensor_v<value_type> &&
+               is_arena_tensor_v<typename Right::value_type>) &&
+             !(is_arena_tensor_v<value_type> &&
+               detail::is_numeric_v<typename Right::value_type>) &&
+             !(detail::is_numeric_v<value_type> &&
+               is_arena_tensor_v<typename Right::value_type>))
   auto add(const Right& right) const {
     return binary(right, [](const value_type& l, const value_t<Right>& r) {
       return l + r;
@@ -1932,6 +2196,74 @@ class Tensor {
                         const value_t<Right> r) { (l += r) *= factor; });
   }
 
+  /// axpy: <tt>result[i] += arg[i] * factor</tt> (factor scales only the
+  /// added operand, not the existing result). Distinct from
+  /// `add_to(arg, factor)` which has the legacy `(result + arg) * factor`
+  /// semantics. Useful as a fused replacement for
+  /// `add_to(result, scale(arg, factor))` when the intermediate
+  /// materialization is undesirable (e.g. when `value_type` is a view).
+  ///
+  /// The lambda body dispatches by element type so the same body works
+  /// for flat and ToT tensors -- at the leaf (scalar) level it uses
+  /// `l += r * factor`; at the cell level it delegates to the cell's
+  /// `axpy_to` member (free or member, found via ADL).
+  template <typename Right, typename Scalar>
+    requires(is_tensor<Right>::value && detail::is_numeric_v<Scalar>)
+  Tensor& axpy_to(const Right& right, const Scalar factor) {
+    if (right.empty()) return *this;
+    if (empty()) {
+      *this = detail::clone_or_cast<Tensor>(right);
+      this->scale_to(factor);
+      return *this;
+    }
+    return inplace_binary(right,
+                          [factor](auto& MADNESS_RESTRICT l, const auto& r) {
+                            using L = std::remove_reference_t<decltype(l)>;
+                            if constexpr (detail::is_tensor_helper<L>::value) {
+                              l.axpy_to(r, factor);
+                            } else {
+                              l += r * factor;
+                            }
+                          });
+  }
+
+  /// axpy with fused permutation on the added operand:
+  /// <tt>result[i] += (perm ^ arg)[i] * factor</tt>.
+  ///
+  /// Bails for view inner cells (which cannot be permuted in place).
+  template <typename Right, typename Scalar, typename Perm>
+    requires(is_tensor<Right>::value && detail::is_numeric_v<Scalar> &&
+             detail::is_permutation_v<Perm>)
+  Tensor& axpy_to(const Right& right, const Scalar factor, const Perm& perm) {
+    if (right.empty()) return *this;
+    if constexpr (is_tensor_view_v<value_type>) {
+      TA_EXCEPTION(
+          "Tensor<View>::axpy_to(right, factor, perm): inner "
+          "permutation is not supported for view inner cells");
+      return *this;
+    } else {
+      auto permuted = right.permute(perm);
+      if (empty()) {
+        // first contribution into an unallocated target (e.g. a contraction
+        // result inner cell): initialize to factor * (perm ^ arg) rather
+        // than asserting non-empty in inplace_binary -- mirrors the
+        // non-permuting axpy_to overload above.
+        *this = detail::clone_or_cast<Tensor>(permuted);
+        this->scale_to(factor);
+        return *this;
+      }
+      return inplace_binary(
+          permuted, [factor](auto& MADNESS_RESTRICT l, const auto& r) {
+            using L = std::remove_reference_t<decltype(l)>;
+            if constexpr (detail::is_tensor_helper<L>::value) {
+              l.axpy_to(r, factor);
+            } else {
+              l += r * factor;
+            }
+          });
+    }
+  }
+
   /// Add a constant to this tensor
 
   /// \param value The constant to be added
@@ -1948,33 +2280,93 @@ class Tensor {
 
   /// Subtract \c right from this and return the result
 
+  /// Element-wise subtraction for `Tensor<ArenaTensor>` ToT operands. Routes
+  /// through the arena binary kernel; inner cells have no `operator-`.
+  template <typename Right>
+    requires(is_arena_tensor_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type>)
+  Tensor subt(const Right& right) const {
+    auto fill = [](typename value_type::value_type* dst,
+                   const typename value_type::value_type* l,
+                   const typename value_type::value_type* r, std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] - r[i];
+    };
+    return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+  }
+
+  /// Mixed `Tensor<ArenaTensor> - Tensor<scalar>`: subtract per-cell scalar
+  /// from every inner element. Routes through the arena scaled kernel.
+  template <typename Right>
+    requires(is_arena_tensor_v<value_type> &&
+             detail::is_numeric_v<typename Right::value_type>)
+  Tensor subt(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    using ElemT = typename value_type::value_type;
+    using Scalar = typename Right::value_type;
+    auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s,
+                   std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = arena[i] - s;
+    };
+    return detail::arena_trivial_scaled<Tensor>(*this, right, fill);
+  }
+
+  /// Mixed `Tensor<scalar> - Tensor<ArenaTensor>`: for each outer cell,
+  /// broadcast the scalar minus each inner element of the arena side.
+  template <typename Right>
+    requires(detail::is_numeric_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type>)
+  Right subt(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    using ArenaInner = typename Right::value_type;
+    using ElemT = typename ArenaInner::value_type;
+    using Scalar = value_type;
+    auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s,
+                   std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = s - arena[i];
+    };
+    return detail::arena_trivial_scaled<Right>(right, *this, fill);
+  }
+
   /// \tparam Right The right-hand tensor type
   /// \param right The tensor that will be subtracted from this tensor
   /// \return A new tensor where the elements are the different between the
   /// elements of \c this and \c right
   template <typename Right,
             typename = std::enable_if_t<
-                detail::tensors_have_equal_nested_rank_v<Tensor, Right>>>
+                detail::tensors_have_equal_nested_rank_v<Tensor, Right> &&
+                !(is_arena_tensor_v<value_type> &&
+                  is_arena_tensor_v<typename Right::value_type>)>>
   Tensor subt(const Right& right) const {
-    return binary(
-        right,
-        [](const value_type& l, const value_t<Right>& r) -> decltype(l - r) {
-          if constexpr (detail::is_tensor_v<value_type>) {
-            if (l.empty()) {
-              if (r.empty())
-                return {};
-              else
-                return -r;
+    if constexpr (detail::is_tensor_of_tensor_v<Tensor> &&
+                  detail::is_ta_tensor_v<value_type> &&
+                  detail::is_ta_tensor_v<typename Right::value_type>) {
+      auto fill = [](typename value_type::value_type* dst,
+                     const typename value_type::value_type* l,
+                     const typename value_type::value_type* r, std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] - r[i];
+      };
+      return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+    } else {
+      return binary(
+          right,
+          [](const value_type& l, const value_t<Right>& r) -> decltype(l - r) {
+            if constexpr (detail::is_tensor_v<value_type>) {
+              if (l.empty()) {
+                if (r.empty())
+                  return {};
+                else
+                  return -r;
+              } else {
+                if (r.empty())
+                  return l.clone();
+                else
+                  return l - r;
+              }
             } else {
-              if (r.empty())
-                return l.clone();
-              else
-                return l - r;
+              return l - r;
             }
-          } else {
-            return l - r;
-          }
-        });
+          });
+    }
   }
 
   /// Subtract \c right from this and return the result permuted by \c perm
@@ -1990,9 +2382,18 @@ class Tensor {
       typename std::enable_if<is_tensor<Right>::value &&
                               detail::is_permutation_v<Perm>>::type* = nullptr>
   Tensor subt(const Right& right, const Perm& perm) const {
-    return binary(
-        right, [](const value_type& l, const value_type& r) { return l - r; },
-        perm);
+    if constexpr (is_tensor_view_v<value_type>) {
+      // Permutation isn't supported for view inner cells (fixed storage
+      // layout). Subt+permute would require materialization.
+      TA_EXCEPTION(
+          "Tensor<View>::subt(right, perm): permutation is not "
+          "supported for view inner cells");
+      return Tensor{};
+    } else {
+      return binary(
+          right, [](const value_type& l, const value_type& r) { return l - r; },
+          perm);
+    }
   }
 
   /// Subtract \c right from this and return the result scaled by a scaling \c
@@ -2009,9 +2410,19 @@ class Tensor {
       typename std::enable_if<is_tensor<Right>::value &&
                               detail::is_numeric_v<Scalar>>::type* = nullptr>
   Tensor subt(const Right& right, const Scalar factor) const {
-    return binary(right, [factor](const value_type& l, const value_type& r) {
-      return (l - r) * factor;
-    });
+    if constexpr (is_arena_tensor_v<value_type> &&
+                  is_arena_tensor_v<typename Right::value_type>) {
+      using ElemT = typename value_type::value_type;
+      auto fill = [factor](ElemT* dst, const ElemT* l, const ElemT* r,
+                           std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = (l[i] - r[i]) * factor;
+      };
+      return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+    } else {
+      return binary(right, [factor](const value_type& l, const value_type& r) {
+        return (l - r) * factor;
+      });
+    }
   }
 
   /// Subtract \c right from this and return the result scaled by a scaling \c
@@ -2030,12 +2441,21 @@ class Tensor {
                 is_tensor<Right>::value && detail::is_numeric_v<Scalar> &&
                 detail::is_permutation_v<Perm>>::type* = nullptr>
   Tensor subt(const Right& right, const Scalar factor, const Perm& perm) const {
-    return binary(
-        right,
-        [factor](const value_type& l, const value_type& r) {
-          return (l - r) * factor;
-        },
-        perm);
+    if constexpr (is_arena_tensor_v<value_type> &&
+                  is_arena_tensor_v<typename Right::value_type>) {
+      if (!arena_perm_is_trivial(perm))
+        TA_EXCEPTION(
+            "TA::Tensor<ArenaTensor>::subt: permuted scaled subt of a "
+            "tensor-of-tensors is not yet supported");
+      return subt(right, factor);
+    } else {
+      return binary(
+          right,
+          [factor](const value_type& l, const value_type& r) {
+            return (l - r) * factor;
+          },
+          perm);
+    }
   }
 
   /// Subtract a constant from a copy of this tensor
@@ -2108,9 +2528,60 @@ class Tensor {
   /// \param right The tensor that will be multiplied by this tensor
   /// \return A new tensor where the elements are the product of the elements
   /// of \c this and \c right
-  template <typename Right,
-            typename std::enable_if<detail::is_nested_tensor_v<Right>>::type* =
-                nullptr>
+  /// Element-wise mult for `Tensor<ArenaTensor>` ToT operands. Routes
+  /// through the arena binary kernel; inner cells have no `operator*`.
+  template <typename Right>
+    requires(is_arena_tensor_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type>)
+  Tensor mult(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    auto fill = [](typename value_type::value_type* dst,
+                   const typename value_type::value_type* l,
+                   const typename value_type::value_type* r, std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] * r[i];
+    };
+    return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+  }
+
+  /// Mixed `Tensor<ArenaTensor> * Tensor<scalar>`: outer Hadamard, each
+  /// inner cell scaled by the corresponding scalar. Routes through the
+  /// arena scaled kernel; no operator* between ArenaTensor and scalar.
+  template <typename Right>
+    requires(is_arena_tensor_v<value_type> &&
+             detail::is_numeric_v<typename Right::value_type>)
+  Tensor mult(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    using ElemT = typename value_type::value_type;
+    using Scalar = typename Right::value_type;
+    auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s,
+                   std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = arena[i] * s;
+    };
+    return detail::arena_trivial_scaled<Tensor>(*this, right, fill);
+  }
+
+  /// Mixed `Tensor<scalar> * Tensor<ArenaTensor>`: symmetric to above,
+  /// result has the same ToT layout as the right operand.
+  template <typename Right>
+    requires(detail::is_numeric_v<value_type> &&
+             is_arena_tensor_v<typename Right::value_type>)
+  Right mult(const Right& right) const {
+    if (empty() || right.empty()) return {};
+    using ArenaInner = typename Right::value_type;
+    using ElemT = typename ArenaInner::value_type;
+    using Scalar = value_type;
+    auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s,
+                   std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = s * arena[i];
+    };
+    return detail::arena_trivial_scaled<Right>(right, *this, fill);
+  }
+
+  template <
+      typename Right,
+      typename std::enable_if<
+          detail::is_nested_tensor_v<Right> && !is_arena_tensor_v<value_type> &&
+          !is_arena_tensor_v<typename Right::value_type>>::type* = nullptr>
   decltype(auto) mult(const Right& right) const {
     auto mult_op = [](const value_type& l, const value_t<Right>& r) {
       return l * r;
@@ -2122,7 +2593,18 @@ class Tensor {
       return res_t{};
     }
 
-    return binary(right, mult_op);
+    if constexpr (detail::is_tensor_of_tensor_v<Tensor> &&
+                  detail::is_ta_tensor_v<value_type> &&
+                  detail::is_ta_tensor_v<typename Right::value_type>) {
+      auto fill = [](typename value_type::value_type* dst,
+                     const typename value_type::value_type* l,
+                     const typename value_type::value_type* r, std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] * r[i];
+      };
+      return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+    } else {
+      return binary(right, mult_op);
+    }
   }
 
   /// Multiply this by \c right to create a new, permuted tensor
@@ -2138,10 +2620,33 @@ class Tensor {
       typename std::enable_if<detail::is_nested_tensor_v<Right> &&
                               detail::is_permutation_v<Perm>>::type* = nullptr>
   decltype(auto) mult(const Right& right, const Perm& perm) const {
-    return binary(
-        right,
-        [](const value_type& l, const value_t<Right>& r) { return l * r; },
-        perm);
+    if constexpr (is_arena_tensor_v<value_type> &&
+                  is_arena_tensor_v<typename Right::value_type>) {
+      if (!arena_perm_is_trivial(perm))
+        TA_EXCEPTION(
+            "TA::Tensor<ArenaTensor>::mult: permuted mult of a "
+            "tensor-of-tensors is not yet supported");
+      return mult(right);
+    } else if constexpr (detail::is_numeric_v<value_type> &&
+                         is_arena_tensor_v<typename Right::value_type>) {
+      // t x tot: a plain scalar tile times an arena ToT tile. The 2-arg
+      // arena overload scales each inner cell into a fresh slab; a
+      // non-trivial result permutation is then a shallow outer reindex of
+      // that slab (the inner part is identity for a Hadamard t x tot).
+      auto result = mult(right);
+      return arena_perm_is_trivial(perm) ? result : result.permute(perm);
+    } else if constexpr (is_arena_tensor_v<value_type> &&
+                         detail::is_numeric_v<typename Right::value_type>) {
+      // tot x t: the mirror of the above -- an arena ToT tile times a plain
+      // scalar tile. Same slab-then-reindex handling.
+      auto result = mult(right);
+      return arena_perm_is_trivial(perm) ? result : result.permute(perm);
+    } else {
+      return binary(
+          right,
+          [](const value_type& l, const value_t<Right>& r) { return l * r; },
+          perm);
+    }
   }
 
   /// Scale and multiply this by \c right to create a new tensor
@@ -2157,10 +2662,20 @@ class Tensor {
       typename std::enable_if<detail::is_nested_tensor_v<Right> &&
                               detail::is_numeric_v<Scalar>>::type* = nullptr>
   decltype(auto) mult(const Right& right, const Scalar factor) const {
-    return binary(right,
-                  [factor](const value_type& l, const value_t<Right>& r) {
-                    return (l * r) * factor;
-                  });
+    if constexpr (is_arena_tensor_v<value_type> &&
+                  is_arena_tensor_v<typename Right::value_type>) {
+      using ElemT = typename value_type::value_type;
+      auto fill = [factor](ElemT* dst, const ElemT* l, const ElemT* r,
+                           std::size_t n) {
+        for (std::size_t i = 0; i < n; ++i) dst[i] = (l[i] * r[i]) * factor;
+      };
+      return detail::arena_trivial_binary<Tensor>(*this, right, fill);
+    } else {
+      return binary(right,
+                    [factor](const value_type& l, const value_t<Right>& r) {
+                      return (l * r) * factor;
+                    });
+    }
   }
 
   /// Scale and multiply this by \c right to create a new, permuted tensor
@@ -2180,12 +2695,21 @@ class Tensor {
                               detail::is_permutation_v<Perm>>::type* = nullptr>
   decltype(auto) mult(const Right& right, const Scalar factor,
                       const Perm& perm) const {
-    return binary(
-        right,
-        [factor](const value_type& l, const value_t<Right>& r) {
-          return (l * r) * factor;
-        },
-        perm);
+    if constexpr (is_arena_tensor_v<value_type> &&
+                  is_arena_tensor_v<typename Right::value_type>) {
+      if (!arena_perm_is_trivial(perm))
+        TA_EXCEPTION(
+            "TA::Tensor<ArenaTensor>::mult: permuted scaled mult of a "
+            "tensor-of-tensors is not yet supported");
+      return mult(right, factor);
+    } else {
+      return binary(
+          right,
+          [factor](const value_type& l, const value_t<Right>& r) {
+            return (l * r) * factor;
+          },
+          perm);
+    }
   }
 
   /// Multiply this tensor by \c right
@@ -2239,7 +2763,13 @@ class Tensor {
     // early exit for empty this
     if (empty()) return this->clone();
 
-    return unary([](const value_type r) { return -r; });
+    if constexpr (is_arena_tensor_v<value_type>) {
+      Tensor result = this->clone();
+      result.scale_to(numeric_type(-1));
+      return result;
+    } else {
+      return unary([](const value_type r) { return -r; });
+    }
   }
 
   /// Create a negated and permuted copy of this tensor
@@ -2253,7 +2783,16 @@ class Tensor {
     // early exit for empty this
     if (empty()) return this->clone();
 
-    return unary([](const value_type l) { return -l; }, perm);
+    if constexpr (is_tensor_view_v<value_type>) {
+      // View cells cannot be permuted in place (size-fixed); permute is
+      // intentionally not supported here.
+      TA_EXCEPTION(
+          "Tensor<View>::neg(perm): permutation is not supported "
+          "for view inner cells");
+      return Tensor{};
+    } else {
+      return unary([](const value_type l) { return -l; }, perm);
+    }
   }
 
   /// Negate elements of this tensor
@@ -2263,7 +2802,11 @@ class Tensor {
     // early exit for empty this
     if (empty()) return *this;
 
-    return inplace_unary([](value_type& MADNESS_RESTRICT l) { l = -l; });
+    if constexpr (is_tensor_view_v<value_type>) {
+      return this->scale_to(numeric_type(-1));
+    } else {
+      return inplace_unary([](value_type& MADNESS_RESTRICT l) { l = -l; });
+    }
   }
 
   /// Create a complex conjugated copy of this tensor
@@ -2799,6 +3342,19 @@ class Tensor {
 
 };  // class Tensor
 
+/// \return the number of bytes an `ArenaTensor` view plus its in-arena cell
+/// occupy in memory space `S`. `size_of(Tensor<ArenaTensor>)` recurses here
+/// once per inner cell; summed over the outer tile this counts the slab.
+template <MemorySpace S, typename T, typename R>
+std::size_t size_of(const ArenaTensor<T, R>& t) {
+  std::size_t result = 0;
+  if constexpr (S == MemorySpace::Host) {
+    result += sizeof(t);  // the one-pointer view itself
+    if (!t.empty()) result += ArenaTensor<T, R>::cell_size(t.size());
+  }
+  return result;
+}
+
 /// \return the number of bytes used by \p t in memory space
 /// `S`
 template <MemorySpace S, typename T, typename A>
diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h
index ebc04ebe23..1a457e61b1 100644
--- a/src/TiledArray/tensor/type_traits.h
+++ b/src/TiledArray/tensor/type_traits.h
@@ -117,18 +117,51 @@ struct is_nested_tensor<T1, T2, Ts...> {
 template <typename... Ts>
 inline constexpr const bool is_nested_tensor_v = is_nested_tensor<Ts...>::value;
 
+}  // namespace detail
+
+/// Forward decl for the tensor-view predicate. Specializations live in
+/// `tensor/arena_tensor.h` (`ArenaTensor`, `detail::TensorInterface`) and
+/// `external/btas.h` (`btas::TensorView`). Declared here so the operator-body
+/// predicates below can consult it without including arena_tensor.h.
+template <typename T>
+struct is_tensor_view : std::false_type {};
+template <typename T>
+inline constexpr bool is_tensor_view_v = is_tensor_view<T>::value;
+
+namespace detail {
+
 /// Predicate used by the shared operator body in
-/// @c TiledArray/tensor/operators_body.ipp to gate the element-wise tensor
-/// operators that are injected into @c namespace TiledArray . The btas-side
-/// copy of the same operators (in @c external/btas.h) partial-specializes
-/// this predicate to @c std::false_type for @c btas::Tensor so the two
-/// namespaces' operators stay non-overlapping under ADL.
+/// @c TiledArray/tensor/operators_body.ipp to gate the **value-returning**
+/// element-wise tensor operators (@c +, @c -, @c *, @c neg) that are
+/// injected into @c namespace TiledArray. These ops produce a *new* tensor
+/// and so are only valid for *freestanding* (owning) tensor types -- a view
+/// like `ArenaTensor` cannot allocate on its own.
+///
+/// The btas-side copy of the same operators (in @c external/btas.h)
+/// partial-specializes this predicate to @c std::false_type for @c
+/// btas::Tensor so the two namespaces' operators stay non-overlapping under
+/// ADL.
 template <typename T>
-struct ta_ops_match_tensor : is_nested_tensor<T> {};
+struct ta_ops_match_tensor
+    : std::bool_constant<is_nested_tensor<T>::value && !is_tensor_view_v<T>> {};
 
 template <typename T>
 inline constexpr bool ta_ops_match_tensor_v = ta_ops_match_tensor<T>::value;
 
+/// Predicate used by the operator body to gate the **compound-assignment**
+/// (in-place) operators (@c +=, @c -=, @c *=). Mutating ops don't allocate,
+/// so they're valid for any tensor whose storage we can mutate -- including
+/// views. By default this is the freestanding predicate union'd with the
+/// tensor-view predicate; the btas-side copy specializes it the same way it
+/// does for the value-returning one.
+template <typename T>
+struct ta_ops_match_tensor_inplace
+    : std::bool_constant<is_nested_tensor<T>::value> {};
+
+template <typename T>
+inline constexpr bool ta_ops_match_tensor_inplace_v =
+    ta_ops_match_tensor_inplace<T>::value;
+
 ////////////////////////////////////////////////////////////////////////////////
 
 template <typename T, typename Enabler = void>
@@ -478,6 +511,36 @@ constexpr bool is_annotation_v<
 
     >{true};
 
+// Detect whether T exposes a `rebind_t<U>` member template. Owning tensor
+// families (TA::Tensor, btas::Tensor) do; view types (TensorInterface,
+// ShiftWrapper, ArenaTensor) do not.
+template <typename T, typename U, typename = void>
+struct has_rebind_t : std::false_type {};
+template <typename T, typename U>
+struct has_rebind_t<T, U, std::void_t<typename T::template rebind_t<U>>>
+    : std::true_type {};
+
+/// The default freestanding (owning) tensor type associated with tensor type
+/// `T` -- the type a value-returning op must produce when handed a `T`.
+///
+/// This is purely the *view -> owning-tensor* map; rebinding the element
+/// type is a separate concern (`rebind_t`). A tensor that is already
+/// freestanding (exposes `rebind_t`, as `TA::Tensor`/`btas::Tensor` do) maps
+/// to itself. A *view* type (`ArenaTensor`, `TensorInterface`, ...) cannot be
+/// a value result and maps to the owning `TA::Tensor<T::value_type>`. A view
+/// may specialize this trait to name a different owning family (e.g.
+/// `btas::TensorView` -> `btas::Tensor`). The mapped type is always
+/// freestanding and therefore always exposes `rebind_t`.
+template <typename T, typename = void>
+struct default_freestanding_tensor {
+  using type =
+      std::conditional_t<has_rebind_t<T, typename T::value_type>::value, T,
+                         Tensor<typename T::value_type>>;
+};
+template <typename T>
+using default_freestanding_tensor_t =
+    typename default_freestanding_tensor<T>::type;
+
 namespace {
 
 template <typename Op, typename Lhs, typename Rhs>
@@ -490,15 +553,6 @@ template <typename Op, typename Lhs, typename Rhs>
 constexpr bool
     is_binop_v<Op, Lhs, Rhs, std::void_t<binop_result_t<Op, Lhs, Rhs>>>{true};
 
-// Detect whether T exposes a `rebind_t<U>` member template. Both TA::Tensor
-// and btas::Tensor do; view types like TensorInterface and ShiftWrapper do
-// not, so callers must fall back to a concrete tensor for the result type.
-template <typename T, typename U, typename = void>
-struct has_rebind_t : std::false_type {};
-template <typename T, typename U>
-struct has_rebind_t<T, U, std::void_t<typename T::template rebind_t<U>>>
-    : std::true_type {};
-
 template <typename Op, typename TensorA, typename TensorB,
           typename Allocator = void,
           typename = std::enable_if_t<is_nested_tensor_v<TensorA, TensorB>>>
@@ -512,19 +566,18 @@ struct result_tensor_helper {
  public:
   using numeric_type = binop_result_t<Op, value_type_A, value_type_B>;
 
-  // Result tensor type stays in TensorA's family with the allocator rebound to
-  // hold `numeric_type`. TA::Tensor and btas::Tensor expose this as
-  // `rebind_t<U>` (TA::Tensor via std::allocator_traits::rebind_alloc; btas
-  // via storage_traits::rebind_t). View types (TensorInterface, ShiftWrapper)
-  // satisfy is_tensor_v but have no `rebind_t` — fall back to TA::Tensor for
-  // those. An explicit @tparam Allocator override only applies when TensorA
-  // is a TA::Tensor.
-  using result_type = std::conditional_t<
-      std::is_same_v<void, Allocator> || !is_ta_tensor_v<TensorA_>,
-      std::conditional_t<has_rebind_t<TensorA_, numeric_type>::value,
-                         typename TensorA_::template rebind_t<numeric_type>,
-                         TA::Tensor<numeric_type>>,
-      TA::Tensor<numeric_type, Allocator>>;
+  // Result tensor type stays in TensorA's *freestanding* family -- TensorA
+  // itself if already owning, or its owning counterpart if TensorA is a view
+  // (see `default_freestanding_tensor`) -- with the allocator rebound to hold
+  // `numeric_type`. The freestanding type always exposes `rebind_t`. An
+  // explicit @tparam Allocator override only applies when TensorA is a
+  // TA::Tensor.
+  using result_type =
+      std::conditional_t<std::is_same_v<void, Allocator> ||
+                             !is_ta_tensor_v<TensorA_>,
+                         typename default_freestanding_tensor_t<
+                             TensorA_>::template rebind_t<numeric_type>,
+                         TA::Tensor<numeric_type, Allocator>>;
 };
 
 }  // namespace
diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h
index ced9987d45..92bf366026 100644
--- a/src/TiledArray/tile_interface/add.h
+++ b/src/TiledArray/tile_interface/add.h
@@ -178,6 +178,35 @@ inline decltype(auto) add_to(Result&& result, const Arg& arg,
   return std::forward<Result>(result).add_to(arg, factor);
 }
 
+/// axpy into the result tile: <tt>result[i] += arg[i] * factor</tt>.
+/// Distinct from `add_to(result, arg, factor)` which has the legacy
+/// `(result + arg) * factor` semantics; this one scales only the added
+/// operand. Use this in fused-accumulation paths (e.g. an einsum loop
+/// computing `out += arg * scalar`) where allocating a scaled temporary
+/// would be either wasteful or impossible (e.g. for view tile types that
+/// lack value-returning `scale`).
+template <typename Result, typename Arg, typename Scalar,
+          typename std::enable_if<
+              detail::is_numeric_v<Scalar> &&
+              detail::has_member_function_axpy_to_anyreturn_v<
+                  Result&&, const Arg&, const Scalar>>::type* = nullptr>
+inline decltype(auto) axpy_to(Result&& result, const Arg& arg,
+                              const Scalar factor) {
+  return std::forward<Result>(result).axpy_to(arg, factor);
+}
+
+/// axpy + fused permutation: <tt>result[i] += (perm ^ arg)[i] * factor</tt>.
+template <
+    typename Result, typename Arg, typename Scalar, typename Perm,
+    typename std::enable_if<
+        detail::is_numeric_v<Scalar> && detail::is_permutation_v<Perm> &&
+        detail::has_member_function_axpy_to_anyreturn_v<
+            Result&&, const Arg&, const Scalar, const Perm&>>::type* = nullptr>
+inline decltype(auto) axpy_to(Result&& result, const Arg& arg,
+                              const Scalar factor, const Perm& perm) {
+  return std::forward<Result>(result).axpy_to(arg, factor, perm);
+}
+
 namespace tile_interface {
 
 using TiledArray::add;
diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h
index 2a5e90ea5d..e599110d49 100644
--- a/src/TiledArray/tile_op/contract_reduce.h
+++ b/src/TiledArray/tile_op/contract_reduce.h
@@ -28,9 +28,13 @@
 
 #include <TiledArray/math/gemm_helper.h>
 #include <TiledArray/permutation.h>
+#include <TiledArray/tensor/arena_einsum.h>
 #include <TiledArray/tensor/complex.h>
 #include <TiledArray/tile_op/tile_interface.h>
 #include <TiledArray/util/function.h>
+#include <optional>
+#include <type_traits>
+#include <variant>
 #include "../tile_interface/add.h"
 #include "../tile_interface/permute.h"
 
@@ -81,23 +85,35 @@ class ContractReduceBase {
 
  private:
   struct Impl {
+    using left_tile_type =
+        std::remove_cv_t<std::remove_reference_t<first_argument_type>>;
+    using right_tile_type =
+        std::remove_cv_t<std::remove_reference_t<second_argument_type>>;
+    using arena_plan_storage_t =
+        TiledArray::detail::arena_plan_storage_t<result_type, left_tile_type,
+                                                 right_tile_type>;
+
     template <
         typename Perm = BipartitePermutation,
         typename ElemMultAddOp = TiledArray::function_ref<elem_muladd_op_type>,
+        typename Plan = arena_plan_storage_t,
         typename = std::enable_if_t<
             TiledArray::detail::is_permutation_v<
                 std::remove_reference_t<Perm>> &&
             std::is_invocable_r_v<void, std::remove_reference_t<ElemMultAddOp>,
                                   result_value_type&, const left_value_type&,
-                                  const right_value_type&>>>
+                                  const right_value_type&> &&
+            std::is_same_v<std::decay_t<Plan>, arena_plan_storage_t>>>
     Impl(const math::blas::Op left_op, const math::blas::Op right_op,
          const scalar_type alpha, const unsigned int result_rank,
          const unsigned int left_rank, const unsigned int right_rank,
-         Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {})
+         Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {},
+         Plan&& arena_plan_in = {})
         : gemm_helper_(left_op, right_op, result_rank, left_rank, right_rank),
           alpha_(alpha),
           perm_(std::forward<Perm>(perm)),
-          elem_muladd_op_(std::forward<ElemMultAddOp>(elem_muladd_op)) {
+          elem_muladd_op_(std::forward<ElemMultAddOp>(elem_muladd_op)),
+          arena_plan_(std::forward<Plan>(arena_plan_in)) {
       // non-unit alpha must be absorbed into elem_muladd_op
       if (elem_muladd_op_) TA_ASSERT(alpha == scalar_type(1));
     }
@@ -111,6 +127,8 @@ class ContractReduceBase {
     /// type-erased reference to custom element multiply-add op
     /// \note the lifetime is managed by the callee!
     TiledArray::function_ref<elem_muladd_op_type> elem_muladd_op_;
+
+    TA_NO_UNIQUE_ADDRESS arena_plan_storage_t arena_plan_;
   };
 
   std::shared_ptr<Impl> pimpl_;
@@ -125,6 +143,8 @@ class ContractReduceBase {
   ContractReduceBase_& operator=(const ContractReduceBase_&) = default;
   ContractReduceBase_& operator=(ContractReduceBase_&&) = default;
 
+  using arena_plan_storage_t = typename Impl::arena_plan_storage_t;
+
   /// Construct contract/reduce functor
 
   /// \tparam Perm a permutation type
@@ -141,21 +161,26 @@ class ContractReduceBase {
   template <
       typename Perm = BipartitePermutation,
       typename ElemMultAddOp = TiledArray::function_ref<elem_muladd_op_type>,
+      typename Plan = typename Impl::arena_plan_storage_t,
       typename = std::enable_if_t<
           TiledArray::detail::is_permutation_v<std::remove_reference_t<Perm>> &&
           std::is_invocable_r_v<void, std::remove_reference_t<ElemMultAddOp>,
                                 result_value_type&, const left_value_type&,
-                                const right_value_type&>>>
+                                const right_value_type&> &&
+          std::is_same_v<std::decay_t<Plan>,
+                         typename Impl::arena_plan_storage_t>>>
   ContractReduceBase(const math::blas::Op left_op,
                      const math::blas::Op right_op, const scalar_type alpha,
                      const unsigned int result_rank,
                      const unsigned int left_rank,
                      const unsigned int right_rank, Perm&& perm = {},
-                     ElemMultAddOp&& elem_muladd_op = {})
+                     ElemMultAddOp&& elem_muladd_op = {},
+                     Plan&& arena_plan_in = {})
       : pimpl_(std::make_shared<Impl>(
             left_op, right_op, alpha, result_rank, left_rank, right_rank,
             std::forward<Perm>(perm),
-            std::forward<ElemMultAddOp>(elem_muladd_op))) {}
+            std::forward<ElemMultAddOp>(elem_muladd_op),
+            std::forward<Plan>(arena_plan_in))) {}
 
   /// Gemm meta data accessor
 
@@ -189,6 +214,14 @@ class ContractReduceBase {
     return pimpl_->elem_muladd_op_;
   }
 
+  /// Arena plan accessor
+
+  /// \return A const reference to the arena plan storage
+  const auto& arena_plan() const {
+    TA_ASSERT(pimpl_);
+    return pimpl_->arena_plan_;
+  }
+
   //-------------- these are only used for unit tests -----------------
 
   /// Compute the number of contracted ranks
@@ -277,18 +310,23 @@ class ContractReduce : public ContractReduceBase<Result, Left, Right, Scalar> {
   template <
       typename Perm = BipartitePermutation,
       typename ElemMultAddOp = TiledArray::function_ref<elem_muladd_op_type>,
+      typename Plan = typename ContractReduceBase_::arena_plan_storage_t,
       typename = std::enable_if_t<
           TiledArray::detail::is_permutation_v<std::remove_reference_t<Perm>> &&
           std::is_invocable_r_v<void, std::remove_reference_t<ElemMultAddOp>,
                                 result_value_type&, const left_value_type&,
-                                const right_value_type&>>>
+                                const right_value_type&> &&
+          std::is_same_v<std::decay_t<Plan>,
+                         typename ContractReduceBase_::arena_plan_storage_t>>>
   ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op,
                  const scalar_type alpha, const unsigned int result_rank,
                  const unsigned int left_rank, const unsigned int right_rank,
-                 Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {})
+                 Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {},
+                 Plan&& arena_plan_in = {})
       : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank,
                             right_rank, std::forward<Perm>(perm),
-                            std::forward<ElemMultAddOp>(elem_muladd_op)) {}
+                            std::forward<ElemMultAddOp>(elem_muladd_op),
+                            std::forward<Plan>(arena_plan_in)) {}
 
   /// Create a result type object
 
@@ -313,8 +351,19 @@ class ContractReduce : public ContractReduceBase<Result, Left, Right, Scalar> {
   /// target
   /// \param[in] arg The argument that will be added to \c result
   void operator()(result_type& result, const result_type& arg) const {
-    using TiledArray::add_to;
-    add_to(result, arg);
+    if constexpr (
+        detail::is_contraction_arena_tot_v<
+            result_type,
+            std::remove_cv_t<std::remove_reference_t<first_argument_type>>,
+            std::remove_cv_t<std::remove_reference_t<second_argument_type>>>) {
+      // Two partial contraction results reduced from disjoint K-panel
+      // subsets can carry different inner-cell sparsity; union their shapes
+      // before accumulating.
+      detail::arena_tot_add_to(result, arg);
+    } else {
+      using TiledArray::add_to;
+      add_to(result, arg);
+    }
   }
 
   /// Contract a pair of tiles and add to a target tile
@@ -332,6 +381,26 @@ class ContractReduce : public ContractReduceBase<Result, Left, Right, Scalar> {
 
     if constexpr (!ContractReduceBase_::plain_tensors) {
       TA_ASSERT(this->elem_muladd_op());
+      if constexpr (detail::is_contraction_arena_tot_v<
+                        result_type,
+                        std::remove_cv_t<
+                            std::remove_reference_t<first_argument_type>>,
+                        std::remove_cv_t<
+                            std::remove_reference_t<second_argument_type>>>) {
+        // The result tile is shaped from operand inner cells. A SUMMA
+        // reduction streams K-panels one at a time: the first panel sizes the
+        // result; a later panel of a contracted-dimension-sparse ToT operand
+        // can touch inner cells the first panel left null, so each subsequent
+        // panel extends the result to cover its own cells.
+        if (this->arena_plan().has_value()) {
+          if (empty(result))
+            result = this->arena_plan()->reserve_and_construct(
+                left, right, this->gemm_helper());
+          else
+            this->arena_plan()->grow_to_cover(result, left, right,
+                                              this->gemm_helper());
+        }
+      }
       gemm(result, left, right, ContractReduceBase_::gemm_helper(),
            this->elem_muladd_op());
     } else {  // plain tensors
@@ -404,18 +473,23 @@ class ContractReduce<Result, Left, Right,
   template <
       typename Perm = BipartitePermutation,
       typename ElemMultAddOp = TiledArray::function_ref<elem_muladd_op_type>,
+      typename Plan = typename ContractReduceBase_::arena_plan_storage_t,
       typename = std::enable_if_t<
           TiledArray::detail::is_permutation_v<std::remove_reference_t<Perm>> &&
           std::is_invocable_r_v<void, std::remove_reference_t<ElemMultAddOp>,
                                 result_value_type&, const left_value_type&,
-                                const right_value_type&>>>
+                                const right_value_type&> &&
+          std::is_same_v<std::decay_t<Plan>,
+                         typename ContractReduceBase_::arena_plan_storage_t>>>
   ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op,
                  const scalar_type alpha, const unsigned int result_rank,
                  const unsigned int left_rank, const unsigned int right_rank,
-                 Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {})
+                 Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {},
+                 Plan&& arena_plan_in = {})
       : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank,
                             right_rank, std::forward<Perm>(perm),
-                            std::forward<ElemMultAddOp>(elem_muladd_op)) {}
+                            std::forward<ElemMultAddOp>(elem_muladd_op),
+                            std::forward<Plan>(arena_plan_in)) {}
 
   /// Create a result type object
 
@@ -530,18 +604,23 @@ class ContractReduce<Result, Left, Right,
   template <
       typename Perm = BipartitePermutation,
       typename ElemMultAddOp = TiledArray::function_ref<elem_muladd_op_type>,
+      typename Plan = typename ContractReduceBase_::arena_plan_storage_t,
       typename = std::enable_if_t<
           TiledArray::detail::is_permutation_v<std::remove_reference_t<Perm>> &&
           std::is_invocable_r_v<void, std::remove_reference_t<ElemMultAddOp>,
                                 result_value_type&, const left_value_type&,
-                                const right_value_type&>>>
+                                const right_value_type&> &&
+          std::is_same_v<std::decay_t<Plan>,
+                         typename ContractReduceBase_::arena_plan_storage_t>>>
   ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op,
                  const scalar_type alpha, const unsigned int result_rank,
                  const unsigned int left_rank, const unsigned int right_rank,
-                 Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {})
+                 Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {},
+                 Plan&& arena_plan_in = {})
       : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank,
                             right_rank, std::forward<Perm>(perm),
-                            std::forward<ElemMultAddOp>(elem_muladd_op)) {}
+                            std::forward<ElemMultAddOp>(elem_muladd_op),
+                            std::forward<Plan>(arena_plan_in)) {}
 
   /// Create a result type object
 
diff --git a/src/TiledArray/tile_op/mult.h b/src/TiledArray/tile_op/mult.h
index 329bf96e58..93ad37796f 100644
--- a/src/TiledArray/tile_op/mult.h
+++ b/src/TiledArray/tile_op/mult.h
@@ -27,6 +27,7 @@
 #define TILEDARRAY_TILE_OP_MULT_H__INCLUDED
 
 #include <TiledArray/error.h>
+#include <TiledArray/tensor/type_traits.h>
 #include <TiledArray/tile_op/tile_interface.h>
 #include <TiledArray/util/function.h>
 #include <TiledArray/zero_tensor.h>
@@ -79,6 +80,37 @@ class Mult {
   /// \note the lifetime is managed by the callee!
   TiledArray::function_ref<element_op_type> element_op_;
 
+  /// True when this Mult's result has view inner cells (e.g. ArenaTensor),
+  /// the only case in which tile_op_ is ever populated. Gates instantiation
+  /// of eval_tile_op so non-view result types (which need not provide a
+  /// `permute` member) are unaffected.
+  static constexpr bool uses_tile_op_ =
+      TiledArray::is_tensor_view_v<result_value_type>;
+
+  /// type-erased reference to a whole-tile op. When set, eval() delegates the
+  /// entire tile product to it. Used for arena tensor-of-tensors products
+  /// whose per-cell op cannot value-return (e.g. ArenaTensor view inner
+  /// cells), so the result tile must be shaped and filled as a unit.
+  /// \note the lifetime is managed by the callee!
+  TiledArray::function_ref<result_type(const left_type&, const right_type&)>
+      tile_op_;
+
+  /// Delegates the whole tile product to tile_op_.
+  result_type eval_tile_op(const left_type& first,
+                           const right_type& second) const {
+    return tile_op_(first, second);
+  }
+
+  /// Delegates the whole tile product to tile_op_, then permutes the result.
+  template <typename Perm, typename = std::enable_if_t<
+                               TiledArray::detail::is_permutation_v<Perm>>>
+  result_type eval_tile_op(const left_type& first, const right_type& second,
+                           const Perm& perm) const {
+    result_type result = tile_op_(first, second);
+    if (perm) result = result.permute(perm);
+    return result;
+  }
+
   // Permuting tile evaluation function
   // These operations cannot consume the argument tile since this operation
   // requires temporary storage space.
@@ -86,6 +118,9 @@ class Mult {
                                TiledArray::detail::is_permutation_v<Perm>>>
   result_type eval(const left_type& first, const right_type& second,
                    const Perm& perm) const {
+    if constexpr (uses_tile_op_) {
+      if (tile_op_) return eval_tile_op(first, second, perm);
+    }
     if (!element_op_) {
       using TiledArray::mult;
       return mult(first, second, perm);
@@ -117,6 +152,9 @@ class Mult {
   template <bool LC, bool RC,
             typename std::enable_if<!(LC || RC)>::type* = nullptr>
   result_type eval(const left_type& first, const right_type& second) const {
+    if constexpr (uses_tile_op_) {
+      if (tile_op_) return eval_tile_op(first, second);
+    }
     if (!element_op_) {
       using TiledArray::mult;
       return mult(first, second);
@@ -128,9 +166,21 @@ class Mult {
 
   template <bool LC, bool RC, typename std::enable_if<LC>::type* = nullptr>
   result_type eval(left_type& first, const right_type& second) const {
+    if constexpr (uses_tile_op_) {
+      if (tile_op_) return eval_tile_op(first, second);
+    }
     if (!element_op_) {
-      using TiledArray::mult_to;
-      return mult_to(std::move(first), second);
+      if constexpr (uses_tile_op_) {
+        // View inner cells (e.g. ArenaTensor): a "consumable" tile is a
+        // shallow handle whose arena slab may be aliased by a persistent
+        // array, so an in-place mult_to would corrupt that operand. Always
+        // produce a fresh result for view-cell tiles.
+        using TiledArray::mult;
+        return mult(first, second);
+      } else {
+        using TiledArray::mult_to;
+        return mult_to(std::move(first), second);
+      }
     } else {
       // TODO figure out why this does not compiles!!!
       //            using TiledArray::inplace_binary;
@@ -144,9 +194,19 @@ class Mult {
   template <bool LC, bool RC,
             typename std::enable_if<!LC && RC>::type* = nullptr>
   result_type eval(const left_type& first, right_type& second) const {
+    if constexpr (uses_tile_op_) {
+      if (tile_op_) return eval_tile_op(first, second);
+    }
     if (!element_op_) {
-      using TiledArray::mult_to;
-      return mult_to(std::move(second), first);
+      if constexpr (uses_tile_op_) {
+        // View inner cells: never consume a shallow handle in place (see the
+        // consume-left overload above).
+        using TiledArray::mult;
+        return mult(first, second);
+      } else {
+        using TiledArray::mult_to;
+        return mult_to(std::move(second), first);
+      }
     } else {  // WARNING: element_op_ might be noncommuting, so can't swap first
               // and second! for GEMM could optimize, but can't introspect
               // element_op_
@@ -195,6 +255,20 @@ class Mult {
                     const left_value_type&, const right_value_type&>>>
   explicit Mult(ElementOp&& op) : element_op_(std::forward<ElementOp>(op)) {}
 
+  /// Tag selecting the whole-tile-op constructor.
+  struct tile_op_tag {};
+
+  /// Construct using a whole-tile op. When set, eval() delegates the entire
+  /// tile product to \p op instead of multiplying element-wise. Used for
+  /// arena tensor-of-tensors products whose per-cell op cannot value-return.
+  /// \tparam TileOp a callable with signature
+  ///         `result_type(const left_type&, const right_type&)`
+  /// \param op the whole-tile operation
+  template <typename TileOp, typename = std::enable_if_t<std::is_invocable_r_v<
+                                 result_type, std::remove_reference_t<TileOp>,
+                                 const left_type&, const right_type&>>>
+  Mult(tile_op_tag, TileOp&& op) : tile_op_(std::forward<TileOp>(op)) {}
+
   /// Multiply-and-permute operator
 
   /// Compute the product of two tiles and permute the result.
diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h
index b18fc8c7ee..35b89365b8 100644
--- a/src/TiledArray/type_traits.h
+++ b/src/TiledArray/type_traits.h
@@ -377,6 +377,8 @@ GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(add)
 GENERATE_HAS_MEMBER_FUNCTION(add)
 GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(add_to)
 GENERATE_HAS_MEMBER_FUNCTION(add_to)
+GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(axpy_to)
+GENERATE_HAS_MEMBER_FUNCTION(axpy_to)
 GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(subt)
 GENERATE_HAS_MEMBER_FUNCTION(subt)
 GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(subt_to)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7a3840ea12..308a3dec1e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -103,6 +103,14 @@ set(ta_test_src_files  ta_test.cpp
     linalg.cpp
     cp.cpp
     btas.cpp
+    arena.cpp
+    arena_kernels.cpp
+    arena_einsum_unit_suite.cpp
+    arena_tot_trivial.cpp
+    arena_sizeof_invariant_suite.cpp
+    arena_tensor.cpp
+    arena_tensor_kernels.cpp
+    tot_construction.cpp
 )
 
 if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)
@@ -186,3 +194,8 @@ else()
           ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT}"
           )
 endif()
+
+if (NOT TARGET test-cases-tiledarray)
+  add_custom_target_subproject(tiledarray test-cases)
+endif()
+add_subdirectory(cases)
diff --git a/tests/arena.cpp b/tests/arena.cpp
new file mode 100644
index 0000000000..46273e8645
--- /dev/null
+++ b/tests/arena.cpp
@@ -0,0 +1,131 @@
+#include "TiledArray/tensor/arena.h"
+
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include <cstddef>
+#include <memory>
+#include <memory_resource>
+#include <vector>
+
+using TiledArray::detail::Arena;
+using TiledArray::detail::ArenaPlan;
+using TiledArray::detail::ArenaResource;
+using TiledArray::detail::plan;
+
+namespace {
+// Minimal Range-like shim for plan() tests: supports only volume().
+struct FakeRange {
+  std::size_t v;
+  std::size_t volume() const noexcept { return v; }
+};
+}
+
+BOOST_AUTO_TEST_SUITE(arena_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(default_arena_is_empty) {
+  Arena a;
+  BOOST_CHECK_EQUAL(a.capacity(), 0u);
+  BOOST_CHECK_EQUAL(a.cursor(), 0u);
+  BOOST_CHECK(a.empty());
+  BOOST_CHECK(a.resource() != nullptr);
+}
+
+BOOST_AUTO_TEST_CASE(reserve_initializes_capacity) {
+  Arena a;
+  a.reserve(1024);
+  BOOST_CHECK_EQUAL(a.capacity(), 1024u);
+  BOOST_CHECK_EQUAL(a.cursor(), 0u);
+  BOOST_CHECK_EQUAL(a.remaining(), 1024u);
+}
+
+BOOST_AUTO_TEST_CASE(reserve_zero_init_clears_slab) {
+  Arena a;
+  a.reserve(64, /*zero_init=*/true);
+  auto h = a.slice<unsigned char>(0, 64);
+  for (std::size_t i = 0; i < 64; ++i) BOOST_CHECK_EQUAL(h[i], 0u);
+}
+
+BOOST_AUTO_TEST_CASE(slice_random_access_and_aliasing) {
+  Arena a;
+  a.reserve(1024);
+  std::shared_ptr<double[]> p1 = a.slice<double>(0, 4);
+  std::shared_ptr<double[]> p2 = a.slice<double>(64, 4);
+  for (int i = 0; i < 4; ++i) p1[i] = double(i);
+  for (int i = 0; i < 4; ++i) p2[i] = double(10 + i);
+  for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p1[i], double(i));
+  for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p2[i], double(10 + i));
+  BOOST_CHECK(static_cast<void*>(&p2[0]) >= static_cast<void*>(&p1[4]));
+}
+
+BOOST_AUTO_TEST_CASE(claim_advances_cursor_and_aligns) {
+  Arena a;
+  a.reserve(1024);
+  std::shared_ptr<double[]> h = a.claim<double>(10);
+  BOOST_REQUIRE(h.get() != nullptr);
+  BOOST_CHECK_EQUAL(reinterpret_cast<std::uintptr_t>(h.get()) % alignof(double),
+                    0u);
+  BOOST_CHECK(a.cursor() >= 10u * sizeof(double));
+}
+
+BOOST_AUTO_TEST_CASE(slab_survives_arena_destruction) {
+  std::shared_ptr<int[]> survivor;
+  {
+    Arena tmp;
+    tmp.reserve(256);
+    survivor = tmp.claim<int>(10);
+    for (int i = 0; i < 10; ++i) survivor[i] = -i;
+  }
+  for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(survivor[i], -i);
+}
+
+BOOST_AUTO_TEST_CASE(plan_uniform_cells) {
+  ArenaPlan p = plan(
+      /*N_cells=*/6,
+      /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{10}; },
+      /*element_size=*/sizeof(double),
+      /*alignment=*/alignof(double));
+  BOOST_CHECK_EQUAL(p.total_bytes, 6u * 10u * sizeof(double));
+  BOOST_CHECK_EQUAL(p.offsets.size(), 6u);
+  BOOST_CHECK_EQUAL(p.offsets[0], 0u);
+  BOOST_CHECK_EQUAL(p.offsets[5], 5u * 10u * sizeof(double));
+}
+
+BOOST_AUTO_TEST_CASE(plan_variable_cells_match_pivot_doc_example) {
+  ArenaPlan p = plan(
+      /*N_cells=*/12,
+      /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{20}; },
+      /*element_size=*/sizeof(double),
+      /*alignment=*/alignof(double));
+  BOOST_CHECK_EQUAL(p.total_bytes, 12u * 20u * sizeof(double));
+  BOOST_CHECK_EQUAL(p.offsets[1], 20u * sizeof(double));
+}
+
+BOOST_AUTO_TEST_CASE(plan_then_construct_then_read) {
+  const std::size_t N = 4;
+  std::vector<std::size_t> volumes = {3, 5, 2, 7};
+  auto shape_fn = [&volumes](std::size_t ord) { return FakeRange{volumes[ord]}; };
+  ArenaPlan p = plan(N, shape_fn, sizeof(double), alignof(double));
+  Arena a;
+  a.reserve(p.total_bytes);
+  std::vector<std::shared_ptr<double[]>> handles(N);
+  for (std::size_t ord = 0; ord < N; ++ord) {
+    handles[ord] = a.slice<double>(p.offsets[ord], volumes[ord]);
+    for (std::size_t i = 0; i < volumes[ord]; ++i)
+      handles[ord][i] = double(100 * ord + i);
+  }
+  for (std::size_t ord = 0; ord < N; ++ord)
+    for (std::size_t i = 0; i < volumes[ord]; ++i)
+      BOOST_CHECK_EQUAL(handles[ord][i], double(100 * ord + i));
+}
+
+BOOST_AUTO_TEST_CASE(arena_resource_is_identity_equal) {
+  Arena a;
+  a.reserve(64);
+  ArenaResource r1(&a);
+  ArenaResource r2(&a);
+  BOOST_CHECK(r1.is_equal(r1));
+  BOOST_CHECK(!r1.is_equal(r2));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_einsum_unit_suite.cpp b/tests/arena_einsum_unit_suite.cpp
new file mode 100644
index 0000000000..6de13f8c84
--- /dev/null
+++ b/tests/arena_einsum_unit_suite.cpp
@@ -0,0 +1,253 @@
+/// Unit tests for arena einsum plans and dispatch.
+
+#include "TiledArray/tensor/arena_einsum.h"
+
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+BOOST_AUTO_TEST_SUITE(arena_einsum_unit_suite, TA_UT_LABEL_SERIAL)
+
+namespace TA = TiledArray;
+
+BOOST_AUTO_TEST_CASE(inner_shape_plan_left_range) {
+  TA::Tensor<double> l(TA::Range{3, 4});
+  TA::Tensor<double> r(TA::Range{3, 4});
+  TA::detail::ArenaInnerShapePlan p{
+      TA::detail::ArenaInnerShapeKind::left_range, std::nullopt};
+  auto out = p.make<TA::Range>(l, r);
+  BOOST_CHECK(out == l.range());
+}
+
+BOOST_AUTO_TEST_CASE(inner_shape_plan_right_range) {
+  TA::Tensor<double> l(TA::Range{3, 4});
+  TA::Tensor<double> r(TA::Range{5, 6});
+  TA::detail::ArenaInnerShapePlan p{
+      TA::detail::ArenaInnerShapeKind::right_range, std::nullopt};
+  auto out = p.make<TA::Range>(l, r);
+  BOOST_CHECK(out == r.range());
+}
+
+BOOST_AUTO_TEST_CASE(inner_shape_plan_gemm_result_range) {
+  TA::Tensor<double> l(TA::Range{3, 5});
+  TA::Tensor<double> r(TA::Range{5, 4});
+  TA::math::GemmHelper gh(TA::math::blas::NoTranspose,
+                           TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::detail::ArenaInnerShapePlan p{
+      TA::detail::ArenaInnerShapeKind::gemm_result_range,
+      std::make_optional(gh)};
+  auto out = p.make<TA::Range>(l, r);
+  BOOST_CHECK_EQUAL(out.volume(), std::size_t{12});
+}
+
+BOOST_AUTO_TEST_CASE(is_contraction_arena_tot_v_predicate) {
+  using ToT = TA::Tensor<TA::Tensor<double>>;
+  static_assert(TA::detail::is_contraction_arena_tot_v<ToT, ToT, ToT>);
+  using Plain = TA::Tensor<double>;
+  static_assert(!TA::detail::is_contraction_arena_tot_v<Plain, Plain, Plain>);
+  BOOST_CHECK(true);
+}
+
+BOOST_AUTO_TEST_CASE(arena_plan_storage_t_resolves) {
+  using ToT = TA::Tensor<TA::Tensor<double>>;
+  using Plain = TA::Tensor<double>;
+  using ToTStorage = TA::detail::arena_plan_storage_t<ToT, ToT, ToT>;
+  using PlainStorage = TA::detail::arena_plan_storage_t<Plain, Plain, Plain>;
+  static_assert(!std::is_same_v<ToTStorage, std::monostate>);
+  static_assert(std::is_same_v<PlainStorage, std::monostate>);
+  BOOST_CHECK(true);
+}
+
+BOOST_AUTO_TEST_CASE(make_plan_returns_nullopt_when_disabled) {
+  using ToT = TA::Tensor<TA::Tensor<double>>;
+  TA::detail::arena_disabled() = true;
+  auto plan = TA::detail::make_contraction_arena_plan<ToT, ToT, ToT>(
+      TA::detail::ArenaInnerShapeKind::left_range, std::nullopt,
+      TA::Permutation{});
+  BOOST_CHECK(!plan.has_value());
+  TA::detail::arena_disabled() = false;
+}
+
+BOOST_AUTO_TEST_CASE(make_plan_returns_nullopt_for_plain_tensor) {
+  using Plain = TA::Tensor<double>;
+  // Non-ToT gating happens inside the function body, not in the return type.
+  auto plan = TA::detail::make_contraction_arena_plan<Plain, Plain, Plain>(
+      TA::detail::ArenaInnerShapeKind::left_range, std::nullopt,
+      TA::Permutation{});
+  BOOST_CHECK(!plan.has_value());
+}
+
+BOOST_AUTO_TEST_CASE(make_plan_rejects_nonidentity_inner_perm) {
+  using ToT = TA::Tensor<TA::Tensor<double>>;
+  TA::Permutation perm({1, 0});
+  auto plan = TA::detail::make_contraction_arena_plan<ToT, ToT, ToT>(
+      TA::detail::ArenaInnerShapeKind::left_range, std::nullopt, perm);
+  BOOST_CHECK(!plan.has_value());
+}
+
+BOOST_AUTO_TEST_CASE(make_plan_returns_active_for_tot) {
+  using ToT = TA::Tensor<TA::Tensor<double>>;
+  auto plan = TA::detail::make_contraction_arena_plan<ToT, ToT, ToT>(
+      TA::detail::ArenaInnerShapeKind::left_range, std::nullopt,
+      TA::Permutation{});
+  BOOST_CHECK(plan.has_value());
+}
+
+namespace {
+using ToT = TA::Tensor<TA::Tensor<double>>;
+
+// Placement-new initializes each ToT inner cell in existing tensor storage.
+ToT make_uniform_tot(const TA::Range& outer, const TA::Range& inner,
+                     double fill) {
+  ToT t(outer);
+  const std::size_t vol = outer.volume();
+  for (std::size_t i = 0; i < vol; ++i) {
+    new (t.data() + i) TA::Tensor<double>(inner, fill);
+  }
+  return t;
+}
+}  // namespace
+
+BOOST_AUTO_TEST_CASE(reserve_and_construct_uniform_inner) {
+  TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose,
+                                  TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose,
+                                  TA::math::blas::NoTranspose, 2, 2, 2);
+  auto left  = make_uniform_tot(TA::Range{2, 3}, TA::Range{3, 5}, 1.0);
+  auto right = make_uniform_tot(TA::Range{3, 4}, TA::Range{5, 4}, 1.0);
+  TA::detail::ArenaInnerShapePlan inner_plan{
+      TA::detail::ArenaInnerShapeKind::gemm_result_range,
+      std::make_optional(inner_gh)};
+  TA::detail::ContractionArenaPlan<ToT, ToT, ToT> plan(inner_plan);
+  ToT result = plan.reserve_and_construct(left, right, outer_gh);
+  BOOST_CHECK_EQUAL(result.range().volume(), std::size_t{8});
+  BOOST_CHECK_EQUAL(result.data()[0].range().volume(), std::size_t{12});
+}
+
+BOOST_AUTO_TEST_CASE(reserve_and_construct_zero_volume_outer_skips_reserve) {
+  TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose,
+                                  TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose,
+                                  TA::math::blas::NoTranspose, 2, 2, 2);
+  auto left  = make_uniform_tot(TA::Range{0, 3}, TA::Range{3, 5}, 1.0);
+  auto right = make_uniform_tot(TA::Range{3, 2}, TA::Range{5, 4}, 1.0);
+  TA::detail::ArenaInnerShapePlan inner_plan{
+      TA::detail::ArenaInnerShapeKind::gemm_result_range,
+      std::make_optional(inner_gh)};
+  TA::detail::ContractionArenaPlan<ToT, ToT, ToT> plan(inner_plan);
+  ToT result = plan.reserve_and_construct(left, right, outer_gh);
+  BOOST_CHECK_EQUAL(result.range().volume(), std::size_t{0});
+}
+
+BOOST_AUTO_TEST_CASE(reserve_and_construct_jagged_inner_per_cell) {
+  // Jagged left cells make first-non-empty K-strip range selection observable.
+  TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose,
+                                  TA::math::blas::NoTranspose, 2, 2, 2);
+  ToT left(TA::Range{2, 3});
+  for (std::size_t m = 0; m < 2; ++m)
+    for (std::size_t k = 0; k < 3; ++k) {
+      TA::Range r{static_cast<long>(m + 1), static_cast<long>(k + 2)};
+      new (left.data() + (m * 3 + k)) TA::Tensor<double>(r, 1.0);
+    }
+  auto right = make_uniform_tot(TA::Range{3, 2}, TA::Range{2, 2}, 1.0);
+  TA::detail::ArenaInnerShapePlan inner_plan{
+      TA::detail::ArenaInnerShapeKind::left_range, std::nullopt};
+  TA::detail::ContractionArenaPlan<ToT, ToT, ToT> plan(inner_plan);
+  ToT result = plan.reserve_and_construct(left, right, outer_gh);
+  BOOST_CHECK_EQUAL(result.range().volume(), std::size_t{4});
+  BOOST_CHECK_EQUAL(result.data()[0].range().volume(), std::size_t{2});
+  BOOST_CHECK_EQUAL(result.data()[1].range().volume(), std::size_t{2});
+  BOOST_CHECK_EQUAL(result.data()[2].range().volume(), std::size_t{4});
+  BOOST_CHECK_EQUAL(result.data()[3].range().volume(), std::size_t{4});
+}
+
+BOOST_AUTO_TEST_CASE(fused_hadamard_inplace_accumulates) {
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> l(TA::Range{4}, 1.0);
+  TA::Tensor<double> rr(TA::Range{4}, 2.0);
+  TA::detail::fused_hadamard_inplace(r, l, rr);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 2.0, 1e-12);
+  TA::detail::fused_hadamard_inplace(r, l, rr);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 4.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_hadamard_scaled_inplace_accumulates) {
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> l(TA::Range{4}, 1.0);
+  TA::Tensor<double> rr(TA::Range{4}, 2.0);
+  TA::detail::fused_hadamard_scaled_inplace(r, l, rr, 3.0);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 6.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_scale_tot_x_t_inplace_accumulates) {
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> l(TA::Range{4}, 1.5);
+  TA::detail::fused_scale_tot_x_t_inplace(r, l, 2.0);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 3.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_scale_t_x_tot_inplace_accumulates) {
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> rr(TA::Range{4}, 2.5);
+  TA::detail::fused_scale_t_x_tot_inplace(r, 4.0, rr);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 10.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_contraction_inplace_accumulates) {
+  TA::Tensor<double> r(TA::Range{2, 2}, 0.0);
+  TA::Tensor<double> l(TA::Range{2, 2}, 1.0);
+  TA::Tensor<double> rr(TA::Range{2, 2}, 2.0);
+  TA::math::GemmHelper gh(TA::math::blas::NoTranspose,
+                           TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::detail::fused_contraction_inplace(r, l, rr, 1.0, gh);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 4.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_hadamard_lambda_round_trip) {
+  auto fn = TA::detail::make_fused_hadamard_lambda<
+      TA::Tensor<double>, TA::Tensor<double>, TA::Tensor<double>>();
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> l(TA::Range{4}, 1.0);
+  TA::Tensor<double> rr(TA::Range{4}, 2.0);
+  fn(r, l, rr);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 2.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_hadamard_scaled_lambda_round_trip) {
+  auto fn = TA::detail::make_fused_hadamard_scaled_lambda<
+      TA::Tensor<double>, TA::Tensor<double>, TA::Tensor<double>, double>(3.0);
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> l(TA::Range{4}, 1.0);
+  TA::Tensor<double> rr(TA::Range{4}, 2.0);
+  fn(r, l, rr);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 6.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_scale_tot_x_t_lambda_round_trip) {
+  auto fn = TA::detail::make_fused_scale_tot_x_t_lambda<
+      TA::Tensor<double>, TA::Tensor<double>, double>();
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> l(TA::Range{4}, 1.5);
+  fn(r, l, 2.0);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 3.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(fused_scale_t_x_tot_lambda_round_trip) {
+  auto fn = TA::detail::make_fused_scale_t_x_tot_lambda<
+      TA::Tensor<double>, double, TA::Tensor<double>>();
+  TA::Tensor<double> r(TA::Range{4}, 0.0);
+  TA::Tensor<double> rr(TA::Range{4}, 2.5);
+  fn(r, 4.0, rr);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_CLOSE(r.data()[i], 10.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_kernels.cpp b/tests/arena_kernels.cpp
new file mode 100644
index 0000000000..4e278fd495
--- /dev/null
+++ b/tests/arena_kernels.cpp
@@ -0,0 +1,160 @@
+/// Unit tests for arena-backed ToT kernels.
+
+#include "TiledArray/tensor/arena_kernels.h"
+
+#include "TiledArray/tensor.h"
+#include "TiledArray/tensor/arena.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include <cstddef>
+#include <vector>
+
+namespace TA = TiledArray;
+using inner_t = TA::Tensor<double>;
+using outer_t = TA::Tensor<inner_t>;
+
+namespace {
+
+outer_t make_tot(std::size_t N_outer, std::size_t n_inner, double base = 1.0) {
+  outer_t outer(TA::Range{static_cast<long>(N_outer)}, 1);
+  for (std::size_t ord = 0; ord < N_outer; ++ord) {
+    inner_t inner(TA::Range{static_cast<long>(n_inner)});
+    for (std::size_t i = 0; i < n_inner; ++i)
+      inner.at_ordinal(i) = base + ord * 100.0 + i;
+    *(outer.data() + ord) = std::move(inner);
+  }
+  return outer;
+}
+
+bool tot_equal(const outer_t& a, const outer_t& b) {
+  if (a.range().volume() != b.range().volume()) return false;
+  for (std::size_t ord = 0; ord < a.range().volume(); ++ord) {
+    const inner_t& ai = *(a.data() + ord);
+    const inner_t& bi = *(b.data() + ord);
+    if (ai.range().volume() != bi.range().volume()) return false;
+    for (std::size_t i = 0; i < ai.range().volume(); ++i)
+      if (ai.at_ordinal(i) != bi.at_ordinal(i)) return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+BOOST_AUTO_TEST_SUITE(arena_kernels_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(trivial_unary_clone_matches_heap_baseline) {
+  outer_t src = make_tot(4, 5, 1.0);
+  auto fill = [](double* dst, const double* src, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) dst[i] = src[i];
+  };
+  outer_t arena_result = TA::detail::arena_trivial_unary<outer_t>(src, fill);
+  BOOST_CHECK(tot_equal(arena_result, src));
+}
+
+BOOST_AUTO_TEST_CASE(trivial_unary_scale_matches_heap_baseline) {
+  outer_t src = make_tot(4, 5, 1.0);
+  const double factor = 2.5;
+  auto fill = [factor](double* dst, const double* src, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) dst[i] = src[i] * factor;
+  };
+  outer_t arena_result = TA::detail::arena_trivial_unary<outer_t>(src, fill);
+  outer_t baseline(src.range(), 1);
+  for (std::size_t ord = 0; ord < src.range().volume(); ++ord) {
+    inner_t inner((src.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (src.data() + ord)->at_ordinal(i) * factor;
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+}
+
+BOOST_AUTO_TEST_CASE(trivial_binary_add_matches_heap_baseline) {
+  outer_t L = make_tot(4, 5, 1.0);
+  outer_t R = make_tot(4, 5, 0.5);
+  auto fill = [](double* dst, const double* l, const double* r, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] + r[i];
+  };
+  outer_t arena_result = TA::detail::arena_trivial_binary<outer_t>(L, R, fill);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) =
+          (L.data() + ord)->at_ordinal(i) + (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+}
+
+BOOST_AUTO_TEST_CASE(arena_outlives_kernel_call) {
+  // The result data deleter co-owns the arena.
+  outer_t arena_result;
+  {
+    outer_t src = make_tot(3, 4, 7.0);
+    auto fill = [](double* dst, const double* src, std::size_t n) {
+      for (std::size_t i = 0; i < n; ++i) dst[i] = src[i];
+    };
+    arena_result = TA::detail::arena_trivial_unary<outer_t>(src, fill);
+  }
+  for (std::size_t ord = 0; ord < arena_result.range().volume(); ++ord)
+    for (std::size_t i = 0; i < (arena_result.data() + ord)->range().volume();
+         ++i)
+      BOOST_CHECK_EQUAL((arena_result.data() + ord)->at_ordinal(i),
+                        7.0 + ord * 100.0 + i);
+}
+
+BOOST_AUTO_TEST_CASE(inner_permute_transposes_each_cell) {
+  // outer tile of 3 cells, each a non-uniform r x c inner matrix
+  outer_t src(TA::Range{3}, 1);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const long r = 2 + static_cast<long>(ord);      // 2, 3, 4
+    const long c = 3 + static_cast<long>(ord % 2);  // 3, 4, 3
+    inner_t inner(TA::Range{r, c});
+    for (long i = 0; i < r; ++i)
+      for (long j = 0; j < c; ++j)
+        inner(i, j) = 1.0 + ord * 100.0 + i * 10.0 + j;
+    *(src.data() + ord) = std::move(inner);
+  }
+  auto result =
+      TA::detail::arena_inner_permute<outer_t>(src, TA::Permutation{1, 0});
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const inner_t& s = *(src.data() + ord);
+    const inner_t& d = *(result.data() + ord);
+    const long r = s.range().extent(0);
+    const long c = s.range().extent(1);
+    BOOST_REQUIRE_EQUAL(d.range().rank(), 2u);
+    BOOST_CHECK_EQUAL(d.range().extent(0), c);
+    BOOST_CHECK_EQUAL(d.range().extent(1), r);
+    for (long i = 0; i < r; ++i)
+      for (long j = 0; j < c; ++j) BOOST_CHECK_EQUAL(d(j, i), s(i, j));
+  }
+}
+
+BOOST_AUTO_TEST_CASE(inner_permute_rank3_cell) {
+  outer_t src(TA::Range{2}, 1);
+  const long e0 = 2, e1 = 3, e2 = 4;
+  for (std::size_t ord = 0; ord < 2; ++ord) {
+    inner_t inner(TA::Range{e0, e1, e2});
+    for (long a = 0; a < e0; ++a)
+      for (long b = 0; b < e1; ++b)
+        for (long c = 0; c < e2; ++c)
+          inner(a, b, c) = ord * 1000.0 + a * 100.0 + b * 10.0 + c;
+    *(src.data() + ord) = std::move(inner);
+  }
+  // perm {2,0,1}: src dim 0->2, 1->0, 2->1 => result(b,c,a) == src(a,b,c)
+  auto result =
+      TA::detail::arena_inner_permute<outer_t>(src, TA::Permutation{2, 0, 1});
+  for (std::size_t ord = 0; ord < 2; ++ord) {
+    const inner_t& s = *(src.data() + ord);
+    const inner_t& d = *(result.data() + ord);
+    BOOST_CHECK_EQUAL(d.range().extent(0), e1);
+    BOOST_CHECK_EQUAL(d.range().extent(1), e2);
+    BOOST_CHECK_EQUAL(d.range().extent(2), e0);
+    for (long a = 0; a < e0; ++a)
+      for (long b = 0; b < e1; ++b)
+        for (long c = 0; c < e2; ++c) BOOST_CHECK_EQUAL(d(b, c, a), s(a, b, c));
+  }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_sizeof_invariant_suite.cpp b/tests/arena_sizeof_invariant_suite.cpp
new file mode 100644
index 0000000000..649e3a50c5
--- /dev/null
+++ b/tests/arena_sizeof_invariant_suite.cpp
@@ -0,0 +1,64 @@
+/// Locks plain-tensor zero-overhead from the arena plan storage field.
+
+#include "TiledArray/tensor.h"
+#include "TiledArray/tensor/arena_einsum.h"
+#include "TiledArray/util/function.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include <cstddef>
+#include <type_traits>
+#include <variant>
+
+namespace TA = TiledArray;
+
+namespace {
+
+using PlainResult = TA::Tensor<double>;
+using PlainLeft = TA::Tensor<double>;
+using PlainRight = TA::Tensor<double>;
+using PlainScalar = double;
+
+using PlainArenaPlanStorage =
+    TA::detail::arena_plan_storage_t<PlainResult, PlainLeft, PlainRight>;
+
+using PlainElemMulAddOp =
+    TA::function_ref<void(double&, const double&, const double&)>;
+
+/// Shadows the public field order of ContractReduceBase::Impl on master.
+struct ImplLayoutMaster {
+  TA::math::GemmHelper gemm_helper_;
+  PlainScalar alpha_;
+  TA::BipartitePermutation perm_;
+  PlainElemMulAddOp elem_muladd_op_;
+};
+
+/// Same as ImplLayoutMaster + trailing TA_NO_UNIQUE_ADDRESS arena_plan_.
+struct ImplLayoutAllocator {
+  TA::math::GemmHelper gemm_helper_;
+  PlainScalar alpha_;
+  TA::BipartitePermutation perm_;
+  PlainElemMulAddOp elem_muladd_op_;
+  TA_NO_UNIQUE_ADDRESS PlainArenaPlanStorage arena_plan_;
+};
+
+static_assert(std::is_same_v<PlainArenaPlanStorage, std::monostate>,
+              "plain-tensor arena_plan_storage_t must be std::monostate");
+
+static_assert(sizeof(ImplLayoutAllocator) == sizeof(ImplLayoutMaster),
+              "TA_NO_UNIQUE_ADDRESS failed to fold arena_plan_ into padding");
+
+}
+
+BOOST_AUTO_TEST_SUITE(arena_sizeof_invariant_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(impl_layout_no_unique_address_invariant) {
+  BOOST_CHECK_EQUAL(sizeof(ImplLayoutAllocator), sizeof(ImplLayoutMaster));
+}
+
+BOOST_AUTO_TEST_CASE(plain_arena_plan_storage_is_monostate) {
+  BOOST_CHECK((std::is_same_v<PlainArenaPlanStorage, std::monostate>));
+  BOOST_CHECK_EQUAL(sizeof(PlainArenaPlanStorage), sizeof(std::monostate));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_tensor.cpp b/tests/arena_tensor.cpp
new file mode 100644
index 0000000000..9b47e1116f
--- /dev/null
+++ b/tests/arena_tensor.cpp
@@ -0,0 +1,338 @@
+/// Unit tests for TiledArray::ArenaTensor: null state, view copy/move,
+/// foreign-tensor assignment, in-place CPOs, materialize.
+
+#include "TiledArray/tensor/arena_tensor.h"
+
+#include "TiledArray/external/btas.h"
+#include "TiledArray/tensor.h"
+#include "TiledArray/tensor/tensor_map.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include <btas/tensorview.h>
+#include <btas/zb/range.h>
+
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace TA = TiledArray;
+// Tests use TA::Range explicitly so the standalone target for materialize()
+// is the natural TA::Tensor<double>; the type's default range is
+// btas::zb::RangeNd, which pairs with btas::Tensor as the standalone.
+using Inner = TA::ArenaTensor<double, TA::Range>;
+
+namespace {
+
+/// Holds an over-aligned byte buffer big enough for a single `Inner` cell of
+/// `n` elements.
+struct CellBuf {
+  std::vector<std::byte> bytes;
+  std::byte* aligned_ptr = nullptr;
+
+  explicit CellBuf(std::size_t n_elems) {
+    const std::size_t total = Inner::cell_size(n_elems);
+    const std::size_t algn = Inner::cell_alignment();
+    bytes.assign(total + algn, std::byte{0});
+    auto base = reinterpret_cast<std::uintptr_t>(bytes.data());
+    auto aligned = (base + algn - 1) & ~(algn - 1);
+    aligned_ptr = reinterpret_cast<std::byte*>(aligned);
+  }
+};
+
+}  // namespace
+
+BOOST_AUTO_TEST_SUITE(arena_tensor_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(sizeof_is_one_pointer) {
+  BOOST_CHECK_EQUAL(sizeof(Inner), sizeof(void*));
+}
+
+BOOST_AUTO_TEST_CASE(sizeof_invariant_across_range_parameterizations) {
+  // `ArenaTensor`'s footprint must be one pointer regardless of the range
+  // template parameter -- this is the original motivation for the type. The
+  // default `btas::zb::RangeNd<>` (~14 B + alignment) and `TA::Range`
+  // (~300 B) both go behind the same `Cell*` indirection.
+  static_assert(sizeof(TA::ArenaTensor<double>) == sizeof(void*),
+                "default-range ArenaTensor<double> must be one pointer");
+  static_assert(
+      sizeof(TA::ArenaTensor<double, ::btas::zb::RangeNd<>>) == sizeof(void*),
+      "zb::RangeNd ArenaTensor<double> must be one pointer");
+  static_assert(sizeof(TA::ArenaTensor<double, TA::Range>) == sizeof(void*),
+                "TA::Range ArenaTensor<double> must be one pointer");
+  // Different element type same story.
+  static_assert(sizeof(TA::ArenaTensor<float>) == sizeof(void*));
+  static_assert(sizeof(TA::ArenaTensor<std::complex<double>>) == sizeof(void*));
+  BOOST_CHECK(true);
+}
+
+BOOST_AUTO_TEST_CASE(element_data_is_simd_aligned) {
+  // data_alignment() should be at least kInnerSimdAlign; cell_alignment()
+  // should propagate that so the element pointer is SIMD-aligned.
+  BOOST_CHECK(Inner::data_alignment() >= TA::kInnerSimdAlign);
+  BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kInnerSimdAlign, 0u);
+  BOOST_CHECK(Inner::cell_alignment() >= Inner::data_alignment());
+  CellBuf buf(8);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{8});
+  auto addr = reinterpret_cast<std::uintptr_t>(x.data());
+  BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+}
+
+BOOST_AUTO_TEST_CASE(default_constructed_is_null) {
+  Inner x;
+  BOOST_CHECK(!x);
+  BOOST_CHECK(x.empty());
+  BOOST_CHECK_EQUAL(x.size(), 0u);
+  BOOST_CHECK(x.data() == nullptr);
+}
+
+BOOST_AUTO_TEST_CASE(make_arena_tensor_zero_initialized) {
+  CellBuf buf(6);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{6});
+  BOOST_REQUIRE(bool(x));
+  BOOST_CHECK(!x.empty());
+  BOOST_CHECK_EQUAL(x.size(), 6u);
+  for (std::size_t i = 0; i < x.size(); ++i)
+    BOOST_CHECK_EQUAL(x.data()[i], 0.0);
+}
+
+BOOST_AUTO_TEST_CASE(copy_construction_yields_alias) {
+  CellBuf buf(4);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{4});
+  Inner y = x;
+  BOOST_CHECK(bool(x));
+  BOOST_CHECK(bool(y));
+  BOOST_CHECK_EQUAL(x.data(), y.data());
+  y.data()[0] = 42.0;
+  BOOST_CHECK_EQUAL(x.data()[0], 42.0);
+}
+
+BOOST_AUTO_TEST_CASE(move_leaves_source_null) {
+  CellBuf buf(4);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{4});
+  Inner y = std::move(x);
+  BOOST_CHECK(!x);
+  BOOST_CHECK(bool(y));
+  BOOST_CHECK_EQUAL(y.size(), 4u);
+}
+
+BOOST_AUTO_TEST_CASE(operator_assign_from_ta_tensor_copies_elements) {
+  CellBuf buf(5);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{5});
+  TA::Tensor<double> src(TA::Range{5}, 0.0);
+  for (std::size_t i = 0; i < 5; ++i) src.data()[i] = double(i + 1);
+  x = src;
+  for (std::size_t i = 0; i < 5; ++i)
+    BOOST_CHECK_EQUAL(x.data()[i], double(i + 1));
+}
+
+BOOST_AUTO_TEST_CASE(zero_fills_with_zeros) {
+  CellBuf buf(4);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{4});
+  for (std::size_t i = 0; i < 4; ++i) x.data()[i] = 7.0;
+  TA::zero(x);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(x.data()[i], 0.0);
+}
+
+BOOST_AUTO_TEST_CASE(fill_sets_all_elements) {
+  CellBuf buf(4);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{4});
+  TA::fill(x, 3.5);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(x.data()[i], 3.5);
+}
+
+BOOST_AUTO_TEST_CASE(scale_to_multiplies_in_place) {
+  CellBuf buf(4);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{4});
+  TA::fill(x, 2.0);
+  TA::scale_to(x, 3.0);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(x.data()[i], 6.0);
+}
+
+BOOST_AUTO_TEST_CASE(add_to_accumulates) {
+  CellBuf bd(4), bs(4);
+  Inner dst =
+      TA::detail::make_arena_tensor_in<double>(bd.aligned_ptr, TA::Range{4});
+  Inner src =
+      TA::detail::make_arena_tensor_in<double>(bs.aligned_ptr, TA::Range{4});
+  TA::fill(dst, 1.0);
+  TA::fill(src, 2.0);
+  TA::add_to(dst, src);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 3.0);
+}
+
+BOOST_AUTO_TEST_CASE(subt_to_subtracts) {
+  CellBuf bd(4), bs(4);
+  Inner dst =
+      TA::detail::make_arena_tensor_in<double>(bd.aligned_ptr, TA::Range{4});
+  Inner src =
+      TA::detail::make_arena_tensor_in<double>(bs.aligned_ptr, TA::Range{4});
+  TA::fill(dst, 5.0);
+  TA::fill(src, 2.0);
+  TA::subt_to(dst, src);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 3.0);
+}
+
+BOOST_AUTO_TEST_CASE(mult_to_does_elementwise) {
+  CellBuf bd(4), bs(4);
+  Inner dst =
+      TA::detail::make_arena_tensor_in<double>(bd.aligned_ptr, TA::Range{4});
+  Inner src =
+      TA::detail::make_arena_tensor_in<double>(bs.aligned_ptr, TA::Range{4});
+  TA::fill(dst, 4.0);
+  TA::fill(src, 0.5);
+  TA::mult_to(dst, src);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 2.0);
+}
+
+BOOST_AUTO_TEST_CASE(axpy_to_scales_and_adds) {
+  CellBuf bd(4), bs(4);
+  Inner dst =
+      TA::detail::make_arena_tensor_in<double>(bd.aligned_ptr, TA::Range{4});
+  Inner src =
+      TA::detail::make_arena_tensor_in<double>(bs.aligned_ptr, TA::Range{4});
+  TA::fill(dst, 1.0);
+  TA::fill(src, 2.0);
+  // axpy_to(y, x, alpha): y += alpha * x (in-place, BLAS-like AXPY).
+  TA::axpy_to(dst, src, 3.0);
+  for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 7.0);
+}
+
+BOOST_AUTO_TEST_CASE(squared_norm_sums_squares) {
+  CellBuf buf(3);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{3});
+  x.data()[0] = 1.0;
+  x.data()[1] = 2.0;
+  x.data()[2] = 2.0;
+  BOOST_CHECK_EQUAL(TA::squared_norm(x), 9.0);
+}
+
+BOOST_AUTO_TEST_CASE(in_place_cpos_no_op_on_null) {
+  Inner null;
+  TA::zero(null);
+  TA::fill(null, 1.0);
+  TA::scale_to(null, 2.0);
+  TA::add_to(null, null);
+  BOOST_CHECK_EQUAL(TA::squared_norm(null), 0.0);
+}
+
+BOOST_AUTO_TEST_CASE(materialize_returns_independent_standalone) {
+  CellBuf buf(4);
+  Inner x =
+      TA::detail::make_arena_tensor_in<double>(buf.aligned_ptr, TA::Range{4});
+  for (std::size_t i = 0; i < 4; ++i) x.data()[i] = double(i);
+  auto standalone = TA::materialize<TA::Tensor<double>>(x);
+  BOOST_REQUIRE_EQUAL(standalone.range().volume(), 4u);
+  for (std::size_t i = 0; i < 4; ++i)
+    BOOST_CHECK_EQUAL(standalone.data()[i], double(i));
+  standalone.data()[0] = 99.0;
+  BOOST_CHECK_EQUAL(x.data()[0], 0.0);
+}
+
+BOOST_AUTO_TEST_CASE(materialize_null_yields_empty_standalone) {
+  Inner null;
+  auto standalone = TA::materialize<TA::Tensor<double>>(null);
+  BOOST_CHECK(standalone.empty());
+}
+
+BOOST_AUTO_TEST_CASE(is_arena_tensor_v_predicate) {
+  static_assert(TA::is_arena_tensor_v<Inner>);
+  static_assert(!TA::is_arena_tensor_v<TA::Tensor<double>>);
+  static_assert(!TA::is_arena_tensor_v<double>);
+  BOOST_CHECK(true);
+}
+
+BOOST_AUTO_TEST_CASE(is_tensor_view_v_predicate) {
+  // ArenaTensor is a view that lacks value-returning member arithmetic --
+  // it cannot allocate on its own. `is_tensor_view_v` is the predicate that
+  // opts such types out of value-returning operator dispatch.
+  static_assert(TA::is_tensor_view_v<Inner>);
+  // btas::TensorView is also a view without member arithmetic.
+  static_assert(TA::is_tensor_view_v<btas::TensorView<double>>);
+  // TA::TensorMap (TensorInterface) is non-owning, but DOES provide
+  // value-returning member arithmetic (it materializes a fresh tensor), so
+  // it is intentionally NOT in `is_tensor_view`.
+  static_assert(!TA::is_tensor_view_v<TA::TensorMap<double>>);
+  static_assert(!TA::is_tensor_view_v<TA::TensorConstMap<double>>);
+  // Value-semantic tensors and scalars are not views.
+  static_assert(!TA::is_tensor_view_v<TA::Tensor<double>>);
+  static_assert(!TA::is_tensor_view_v<btas::Tensor<double>>);
+  static_assert(!TA::is_tensor_view_v<double>);
+  // Layering: is_arena_tensor_v implies is_tensor_view_v.
+  static_assert(!TA::is_arena_tensor_v<TA::TensorMap<double>>);
+  static_assert(!TA::is_arena_tensor_v<btas::TensorView<double>>);
+  BOOST_CHECK(true);
+}
+
+BOOST_AUTO_TEST_CASE(gemm_inner_matrix_product) {
+  // C[3,5] += A[3,4] * B[4,5]; A is 1..12 row-major, B is 0.0,0.5,...,9.5.
+  CellBuf bl(12), br(20), bc(15);
+  Inner left =
+      TA::detail::make_arena_tensor_in<double>(bl.aligned_ptr, TA::Range{3, 4});
+  Inner right =
+      TA::detail::make_arena_tensor_in<double>(br.aligned_ptr, TA::Range{4, 5});
+  Inner result =
+      TA::detail::make_arena_tensor_in<double>(bc.aligned_ptr, TA::Range{3, 5});
+  for (int i = 0; i < 12; ++i) left.data()[i] = double(i + 1);
+  for (int i = 0; i < 20; ++i) right.data()[i] = 0.5 * double(i);
+  TA::zero(result);
+
+  TA::math::GemmHelper helper(TA::math::blas::NoTranspose,
+                              TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::gemm(result, left, right, 1.0, helper);
+
+  // Row-major reference: ref[i,k] = sum_j A[i,j] * B[j,k].
+  double ref[15] = {0};
+  for (int i = 0; i < 3; ++i)
+    for (int k = 0; k < 5; ++k)
+      for (int j = 0; j < 4; ++j)
+        ref[i * 5 + k] += left.data()[i * 4 + j] * right.data()[j * 5 + k];
+  for (int i = 0; i < 15; ++i)
+    BOOST_CHECK_CLOSE(result.data()[i], ref[i], 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(gemm_inner_accumulates_into_result) {
+  // C starts at known nonzero, gemm accumulates (beta=1).
+  CellBuf bl(4), br(4), bc(4);
+  Inner left =
+      TA::detail::make_arena_tensor_in<double>(bl.aligned_ptr, TA::Range{2, 2});
+  Inner right =
+      TA::detail::make_arena_tensor_in<double>(br.aligned_ptr, TA::Range{2, 2});
+  Inner result =
+      TA::detail::make_arena_tensor_in<double>(bc.aligned_ptr, TA::Range{2, 2});
+  TA::fill(left, 1.0);
+  TA::fill(right, 2.0);
+  TA::fill(result, 10.0);  // preload
+
+  TA::math::GemmHelper helper(TA::math::blas::NoTranspose,
+                              TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::gemm(result, left, right, 1.0, helper);
+  // Each result entry: 10 (preload) + 2 (sum_j 1*2 over j=0..1) = 14.
+  for (int i = 0; i < 4; ++i) BOOST_CHECK_CLOSE(result.data()[i], 14.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(gemm_inner_skips_when_operand_null) {
+  // Null operands -> result unchanged (no-op).
+  CellBuf bc(4);
+  Inner result =
+      TA::detail::make_arena_tensor_in<double>(bc.aligned_ptr, TA::Range{2, 2});
+  TA::fill(result, 7.0);
+  Inner null_inner;
+  TA::math::GemmHelper helper(TA::math::blas::NoTranspose,
+                              TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::gemm(result, null_inner, null_inner, 1.0, helper);
+  for (int i = 0; i < 4; ++i) BOOST_CHECK_CLOSE(result.data()[i], 7.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp
new file mode 100644
index 0000000000..fea3046aa8
--- /dev/null
+++ b/tests/arena_tensor_kernels.cpp
@@ -0,0 +1,633 @@
+/// Tests for the arena-backed factory that builds an outer tile of
+/// `ArenaTensor` cells: SIMD-aligned data, null cells for zero-volume
+/// shapes, monotonic slab layout, slab survives factory scope.
+
+#include "TiledArray/tensor/arena_kernels.h"
+
+#include "TiledArray/tensor.h"
+#include "TiledArray/tensor/arena_einsum.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include <optional>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace TA = TiledArray;
+using Inner = TA::ArenaTensor<double, TA::Range>;
+using Outer = TA::Tensor<Inner>;
+
+BOOST_AUTO_TEST_SUITE(arena_tensor_kernels_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(builds_outer_with_uniform_inners) {
+  TA::Range outer_r{4};
+  auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{8}; };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  BOOST_REQUIRE_EQUAL(outer.range().volume(), 4u);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& inner = outer.data()[ord];
+    BOOST_CHECK(bool(inner));
+    BOOST_CHECK_EQUAL(inner.size(), 8u);
+    auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
+    BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(zero_volume_shapes_yield_null_inners) {
+  TA::Range outer_r{4};
+  auto shape_fn = [](std::size_t ord) {
+    return ord % 2 == 0 ? TA::Range{4} : TA::Range();
+  };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& inner = outer.data()[ord];
+    if (ord % 2 == 0) {
+      BOOST_CHECK(bool(inner));
+      BOOST_CHECK_EQUAL(inner.size(), 4u);
+    } else {
+      BOOST_CHECK(!inner);
+    }
+  }
+}
+
+BOOST_AUTO_TEST_CASE(non_null_cells_share_one_monotonic_slab) {
+  TA::Range outer_r{6};
+  auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{6}; };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  const double* prev_end = nullptr;
+  for (std::size_t ord = 0; ord < 6; ++ord) {
+    Inner& inner = outer.data()[ord];
+    const double* begin = inner.data();
+    const double* end = begin + inner.size();
+    if (prev_end != nullptr) {
+      BOOST_CHECK(begin >= prev_end);
+      // Gap bounded by one cell stride (cache-line-floor or SIMD-driven).
+      const std::size_t gap =
+          static_cast<std::size_t>(begin - prev_end) * sizeof(double);
+      BOOST_CHECK_LE(gap,
+                     TA::detail::kArenaCachelineAlign + Inner::cell_size(0));
+    }
+    prev_end = end;
+  }
+}
+
+BOOST_AUTO_TEST_CASE(outer_outlives_factory_scope) {
+  Outer outer;
+  {
+    TA::Range outer_r{3};
+    auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{4}; };
+    outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  }
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    Inner& inner = outer.data()[ord];
+    TA::fill(inner, double(ord + 1));
+  }
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    Inner& inner = outer.data()[ord];
+    for (std::size_t i = 0; i < inner.size(); ++i)
+      BOOST_CHECK_EQUAL(inner.data()[i], double(ord + 1));
+  }
+}
+
+BOOST_AUTO_TEST_CASE(jagged_inner_shapes_round_trip) {
+  TA::Range outer_r{4};
+  std::vector<long> sizes = {3, 5, 0, 7};
+  auto shape_fn = [&](std::size_t ord) {
+    return sizes[ord] == 0 ? TA::Range() : TA::Range{sizes[ord]};
+  };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& inner = outer.data()[ord];
+    if (sizes[ord] == 0) {
+      BOOST_CHECK(!inner);
+    } else {
+      BOOST_REQUIRE(bool(inner));
+      BOOST_CHECK_EQUAL(inner.size(), static_cast<std::size_t>(sizes[ord]));
+      auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
+      BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+    }
+  }
+}
+
+BOOST_AUTO_TEST_CASE(empty_outer_range_yields_no_slab) {
+  TA::Range outer_r{0};
+  auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{4}; };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  BOOST_CHECK_EQUAL(outer.range().volume(), 0u);
+}
+
+BOOST_AUTO_TEST_CASE(all_null_outer_works) {
+  TA::Range outer_r{5};
+  auto shape_fn = [](std::size_t /*ord*/) { return TA::Range(); };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  for (std::size_t ord = 0; ord < 5; ++ord) BOOST_CHECK(!outer.data()[ord]);
+}
+
+namespace {
+
+/// Build an outer with uniform inners filled by an ordinal-dependent rule.
+Outer make_outer(std::size_t n_outer, std::size_t n_inner, double base) {
+  TA::Range outer_r{static_cast<long>(n_outer)};
+  auto shape_fn = [n_inner](std::size_t /*ord*/) {
+    return TA::Range{static_cast<long>(n_inner)};
+  };
+  Outer outer = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  for (std::size_t ord = 0; ord < n_outer; ++ord) {
+    Inner& inner = outer.data()[ord];
+    for (std::size_t i = 0; i < inner.size(); ++i)
+      inner.data()[i] = base + ord * 100.0 + i;
+  }
+  return outer;
+}
+
+bool outers_equal(const Outer& a, const Outer& b) {
+  if (a.range().volume() != b.range().volume()) return false;
+  for (std::size_t ord = 0; ord < a.range().volume(); ++ord) {
+    const Inner& ai = a.data()[ord];
+    const Inner& bi = b.data()[ord];
+    if (bool(ai) != bool(bi)) return false;
+    if (!ai) continue;
+    if (ai.size() != bi.size()) return false;
+    for (std::size_t i = 0; i < ai.size(); ++i)
+      if (ai.data()[i] != bi.data()[i]) return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+BOOST_AUTO_TEST_CASE(arena_tensor_is_a_tensor_but_a_view) {
+  // ArenaTensor is registered as is_tensor_helper / is_contiguous_tensor so
+  // kernel paths treat it like Tensor<double>; the `is_tensor_view` trait
+  // opts it out of value-returning member-call paths (which require
+  // allocation a view cannot do).
+  static_assert(TA::detail::is_tensor_helper<Inner>::value);
+  static_assert(TA::detail::is_contiguous_tensor<Inner>::value);
+  static_assert(TA::detail::is_tensor<Inner>::value);
+  static_assert(TA::is_tensor_view_v<Inner>);
+  static_assert(TA::is_arena_tensor_v<Inner>);
+  // ta_ops_match_tensor (value-returning ops gate) is now false for views.
+  static_assert(!TA::detail::ta_ops_match_tensor_v<Inner>);
+  // ta_ops_match_tensor_inplace (in-place ops gate) is true.
+  static_assert(TA::detail::ta_ops_match_tensor_inplace_v<Inner>);
+  BOOST_CHECK(true);
+}
+
+BOOST_AUTO_TEST_CASE(trivial_clone_inner_round_trip) {
+  Outer src = make_outer(4, 5, 1.0);
+  Outer copy = src.clone();
+  BOOST_CHECK(outers_equal(copy, src));
+  // Independent slab: mutating copy doesn't affect src.
+  copy.data()[0].data()[0] = -1.0;
+  BOOST_CHECK_EQUAL(src.data()[0].data()[0], 1.0);
+}
+
+BOOST_AUTO_TEST_CASE(trivial_scale_inner_multiplies) {
+  Outer src = make_outer(3, 4, 1.0);
+  Outer scaled = src.scale(2.5);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& sinner = src.data()[ord];
+    const Inner& dinner = scaled.data()[ord];
+    BOOST_REQUIRE_EQUAL(dinner.size(), sinner.size());
+    for (std::size_t i = 0; i < sinner.size(); ++i)
+      BOOST_CHECK_EQUAL(dinner.data()[i], sinner.data()[i] * 2.5);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(trivial_add_inner_accumulates) {
+  Outer L = make_outer(3, 4, 1.0);
+  Outer R = make_outer(3, 4, 0.5);
+  Outer sum = L.add(R);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& l = L.data()[ord];
+    const Inner& r = R.data()[ord];
+    const Inner& d = sum.data()[ord];
+    for (std::size_t i = 0; i < l.size(); ++i)
+      BOOST_CHECK_EQUAL(d.data()[i], l.data()[i] + r.data()[i]);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(trivial_subt_inner_subtracts) {
+  Outer L = make_outer(3, 4, 5.0);
+  Outer R = make_outer(3, 4, 1.0);
+  Outer diff = L.subt(R);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& l = L.data()[ord];
+    const Inner& r = R.data()[ord];
+    const Inner& d = diff.data()[ord];
+    for (std::size_t i = 0; i < l.size(); ++i)
+      BOOST_CHECK_EQUAL(d.data()[i], l.data()[i] - r.data()[i]);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(trivial_mult_inner_elementwise) {
+  Outer L = make_outer(3, 4, 2.0);
+  Outer R = make_outer(3, 4, 0.5);
+  Outer prod = L.mult(R);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& l = L.data()[ord];
+    const Inner& r = R.data()[ord];
+    const Inner& d = prod.data()[ord];
+    for (std::size_t i = 0; i < l.size(); ++i)
+      BOOST_CHECK_EQUAL(d.data()[i], l.data()[i] * r.data()[i]);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(contraction_arena_plan_reserve_and_construct_inner) {
+  // Verify ContractionArenaPlan's inner-tensor dispatch builds the right
+  // outer/inner shapes and SIMD-aligns each non-null inner cell.
+  TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose,
+                                TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose,
+                                TA::math::blas::NoTranspose, 2, 2, 2);
+  auto left = TA::detail::arena_outer_init<Outer>(TA::Range{2, 3}, 1,
+                                                  [](std::size_t /*ord*/) {
+                                                    return TA::Range{4, 5};
+                                                  });
+  auto right = TA::detail::arena_outer_init<Outer>(TA::Range{3, 4}, 1,
+                                                   [](std::size_t /*ord*/) {
+                                                     return TA::Range{5, 6};
+                                                   });
+  TA::detail::ArenaInnerShapePlan inner_plan{
+      TA::detail::ArenaInnerShapeKind::gemm_result_range,
+      std::make_optional(inner_gh)};
+  TA::detail::ContractionArenaPlan<Outer, Outer, Outer> plan(inner_plan);
+  Outer result = plan.reserve_and_construct(left, right, outer_gh);
+  // Outer result: 2x4 = 8 cells; each inner: 4x6 = 24 elements.
+  BOOST_REQUIRE_EQUAL(result.range().volume(), 8u);
+  for (std::size_t ord = 0; ord < 8; ++ord) {
+    const Inner& inner = result.data()[ord];
+    BOOST_REQUIRE(bool(inner));
+    BOOST_CHECK_EQUAL(inner.size(), 24u);
+    auto addr = reinterpret_cast<std::uintptr_t>(inner.data());
+    BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(outer_gemm_with_arena_tensor_contraction) {
+  // End-to-end: arena-allocate result via the plan, then run TA::Tensor's
+  // outer gemm with a custom elem_muladd_op that calls the free gemm CPO
+  // for ArenaTensor inners. Verifies the full chain reserve_and_construct
+  // -> outer iteration -> inner BLAS gemm.
+  TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose,
+                                TA::math::blas::NoTranspose, 2, 2, 2);
+  TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose,
+                                TA::math::blas::NoTranspose, 2, 2, 2);
+  // A[2,3] outer of <4,5> inners (each 1.0); B[3,4] outer of <5,6> inners
+  // (each 2.0). C[2,4] outer of <4,6> inners; each entry =
+  //   sum over outer k in [0,3) of sum over inner k in [0,5) of 1.0*2.0
+  //   = 3 * 5 * 2.0 = 30.0
+  auto left = TA::detail::arena_outer_init<Outer>(TA::Range{2, 3}, 1,
+                                                  [](std::size_t /*ord*/) {
+                                                    return TA::Range{4, 5};
+                                                  });
+  auto right = TA::detail::arena_outer_init<Outer>(TA::Range{3, 4}, 1,
+                                                   [](std::size_t /*ord*/) {
+                                                     return TA::Range{5, 6};
+                                                   });
+  for (std::size_t i = 0; i < left.range().volume(); ++i)
+    TA::fill(left.data()[i], 1.0);
+  for (std::size_t i = 0; i < right.range().volume(); ++i)
+    TA::fill(right.data()[i], 2.0);
+
+  TA::detail::ArenaInnerShapePlan inner_plan{
+      TA::detail::ArenaInnerShapeKind::gemm_result_range,
+      std::make_optional(inner_gh)};
+  TA::detail::ContractionArenaPlan<Outer, Outer, Outer> plan(inner_plan);
+  Outer result = plan.reserve_and_construct(left, right, outer_gh);
+
+  auto elem_muladd = [&inner_gh](Inner& r, const Inner& l, const Inner& rr) {
+    TA::gemm(r, l, rr, 1.0, inner_gh);
+  };
+  result.gemm(left, right, outer_gh, elem_muladd);
+
+  for (std::size_t ord = 0; ord < result.range().volume(); ++ord) {
+    const Inner& inner = result.data()[ord];
+    BOOST_REQUIRE(bool(inner));
+    BOOST_REQUIRE_EQUAL(inner.size(), 24u);
+    for (std::size_t e = 0; e < 24; ++e)
+      BOOST_CHECK_CLOSE(inner.data()[e], 30.0, 1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(trivial_ops_preserve_null_cells) {
+  // Outer with mixed null and non-null inners; trivial ops should propagate
+  // null cells through to the result.
+  TA::Range outer_r{4};
+  auto shape_fn = [](std::size_t ord) {
+    return ord % 2 == 0 ? TA::Range{4} : TA::Range();
+  };
+  Outer src = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& inner = src.data()[ord];
+    if (inner) {
+      for (std::size_t i = 0; i < inner.size(); ++i) inner.data()[i] = 1.0;
+    }
+  }
+  Outer scaled = src.scale(3.0);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    const Inner& d = scaled.data()[ord];
+    if (ord % 2 == 0) {
+      BOOST_REQUIRE(bool(d));
+      for (std::size_t i = 0; i < d.size(); ++i)
+        BOOST_CHECK_EQUAL(d.data()[i], 3.0);
+    } else {
+      BOOST_CHECK(!d);
+    }
+  }
+}
+
+// Outer-tile serialize round-trip: exercises the arena-aware path in
+// TA::Tensor::serialize directly via an in-memory archive. The slab
+// gets rebuilt on load.
+BOOST_AUTO_TEST_CASE(outer_tile_serialize_round_trip_arena_tensor) {
+  // Build an outer with jagged inner shapes including one null cell.
+  Outer src =
+      TA::detail::arena_outer_init<Outer>(TA::Range{4}, 1, [](std::size_t ord) {
+        if (ord == 2) return TA::Range();  // null cell
+        return TA::Range{static_cast<long>(3 + ord)};
+      });
+  // Fill non-null cells with ord-dependent values.
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& cell = src.data()[ord];
+    if (cell) {
+      for (std::size_t i = 0; i < cell.size(); ++i)
+        cell.data()[i] = double(ord * 100 + i);
+    }
+  }
+
+  const std::size_t buf_size = 1 << 16;
+  std::vector<unsigned char> buf(buf_size);
+  madness::archive::BufferOutputArchive oar(buf.data(), buf_size);
+  BOOST_REQUIRE_NO_THROW(oar & src);
+  const std::size_t nbyte = oar.size();
+  oar.close();
+
+  Outer dst;
+  madness::archive::BufferInputArchive iar(buf.data(), nbyte);
+  BOOST_REQUIRE_NO_THROW(iar & dst);
+  iar.close();
+
+  // Verify outer shape, null/non-null flags, inner shapes, element values.
+  BOOST_REQUIRE_EQUAL(dst.range().volume(), src.range().volume());
+  for (std::size_t ord = 0; ord < src.range().volume(); ++ord) {
+    const Inner& s = src.data()[ord];
+    const Inner& d = dst.data()[ord];
+    BOOST_REQUIRE_EQUAL(bool(s), bool(d));
+    if (!s) continue;
+    BOOST_REQUIRE_EQUAL(s.size(), d.size());
+    for (std::size_t i = 0; i < s.size(); ++i)
+      BOOST_CHECK_EQUAL(d.data()[i], s.data()[i]);
+    // The loaded cell's data pointer is SIMD-aligned via
+    // arena_outer_init.
+    auto addr = reinterpret_cast<std::uintptr_t>(d.data());
+    BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u);
+  }
+}
+
+// DistArray-level test: forces `TA::DistArray<TA::Tensor<ArenaTensor>>` to
+// instantiate, exercising arena-aware serialization at the outer-tile
+// boundary. Serial-only (no @distributed).
+BOOST_AUTO_TEST_CASE(distarray_arena_tensor_construct_and_init_tiles) {
+  using Array = TA::DistArray<Outer, TA::DensePolicy>;
+  auto& world = TA::get_default_world();
+  TA::TiledRange tr{TA::TiledRange1{0, 2, 4}};
+  Array A(world, tr);
+  A.init_tiles([](const TA::Range& tile_range) {
+    return TA::detail::arena_outer_init<Outer>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{3}; });
+  });
+  world.gop.fence();
+  BOOST_CHECK_EQUAL(A.trange().tiles_range().volume(), 2u);
+  if (A.is_local(0)) {
+    Outer tile = A.find(0).get();
+    BOOST_CHECK_EQUAL(tile.range().volume(), 2u);
+    for (std::size_t i = 0; i < tile.range().volume(); ++i) {
+      const Inner& cell = tile.data()[i];
+      BOOST_REQUIRE(bool(cell));
+      BOOST_CHECK_EQUAL(cell.size(), 3u);
+    }
+  }
+}
+
+// Mixed scalar/ArenaTensor outer Hadamard: each scalar-side outer cell
+// multiplies the corresponding ArenaTensor-side inner element-wise.
+// Exercises Tensor<ArenaTensor>::mult(Tensor<scalar>) and the symmetric
+// Tensor<scalar>::mult(Tensor<ArenaTensor>).
+BOOST_AUTO_TEST_CASE(mixed_outer_mult_scalar_times_arena) {
+  using Scalars = TA::Tensor<double>;
+  // 3 outer cells, each inner of size 4, base value 1.0 + ord*100 + i.
+  Outer A = make_outer(3, 4, 1.0);
+  Scalars S(TA::Range{3});
+  S.at_ordinal(0) = 2.0;
+  S.at_ordinal(1) = -1.5;
+  S.at_ordinal(2) = 0.25;
+
+  // Tensor<ArenaTensor> * Tensor<scalar>
+  Outer prod_as = A.mult(S);
+  BOOST_REQUIRE_EQUAL(prod_as.range().volume(), 3u);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = A.data()[ord];
+    const Inner& d = prod_as.data()[ord];
+    BOOST_REQUIRE(bool(d));
+    BOOST_REQUIRE_EQUAL(d.size(), a.size());
+    // Result must be independent of the source slab.
+    BOOST_CHECK_NE(d.data(), a.data());
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], a.data()[i] * S.at_ordinal(ord), 1e-12);
+  }
+
+  // Tensor<scalar> * Tensor<ArenaTensor>
+  Outer prod_sa = S.mult(A);
+  BOOST_REQUIRE_EQUAL(prod_sa.range().volume(), 3u);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = A.data()[ord];
+    const Inner& d = prod_sa.data()[ord];
+    BOOST_REQUIRE(bool(d));
+    BOOST_REQUIRE_EQUAL(d.size(), a.size());
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], S.at_ordinal(ord) * a.data()[i], 1e-12);
+  }
+}
+
+// Mixed mult preserves null cells coming from the arena side.
+BOOST_AUTO_TEST_CASE(mixed_outer_mult_preserves_null_cells) {
+  using Scalars = TA::Tensor<double>;
+  TA::Range outer_r{4};
+  auto shape_fn = [](std::size_t ord) {
+    return ord % 2 == 0 ? TA::Range{4} : TA::Range();
+  };
+  Outer A = TA::detail::arena_outer_init<Outer>(outer_r, 1, shape_fn);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    Inner& inner = A.data()[ord];
+    if (inner)
+      for (std::size_t i = 0; i < inner.size(); ++i) inner.data()[i] = 1.0;
+  }
+  Scalars S(TA::Range{4});
+  S.at_ordinal(0) = 3.0;
+  S.at_ordinal(1) = 7.0;
+  S.at_ordinal(2) = -2.0;
+  S.at_ordinal(3) = 11.0;
+
+  Outer prod = A.mult(S);
+  for (std::size_t ord = 0; ord < 4; ++ord) {
+    const Inner& d = prod.data()[ord];
+    if (ord % 2 == 0) {
+      BOOST_REQUIRE(bool(d));
+      for (std::size_t i = 0; i < d.size(); ++i)
+        BOOST_CHECK_CLOSE(d.data()[i], 1.0 * S.at_ordinal(ord), 1e-12);
+    } else {
+      BOOST_CHECK(!d);
+    }
+  }
+}
+
+// Mixed scalar/ArenaTensor add/subt: scalar broadcast across each inner.
+BOOST_AUTO_TEST_CASE(mixed_outer_add_subt_scalar_and_arena) {
+  using Scalars = TA::Tensor<double>;
+  Outer A = make_outer(3, 4, 1.0);
+  Scalars S(TA::Range{3});
+  S.at_ordinal(0) = 10.0;
+  S.at_ordinal(1) = -2.0;
+  S.at_ordinal(2) = 0.5;
+
+  // ToT + scalar  → broadcast scalar across each inner element.
+  Outer sum_as = A.add(S);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = A.data()[ord];
+    const Inner& d = sum_as.data()[ord];
+    BOOST_REQUIRE(bool(d));
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], a.data()[i] + S.at_ordinal(ord), 1e-12);
+  }
+  // scalar + ToT  → symmetric.
+  Outer sum_sa = S.add(A);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = A.data()[ord];
+    const Inner& d = sum_sa.data()[ord];
+    BOOST_REQUIRE(bool(d));
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], S.at_ordinal(ord) + a.data()[i], 1e-12);
+  }
+  // ToT - scalar  → subtract per-cell scalar.
+  Outer diff_as = A.subt(S);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = A.data()[ord];
+    const Inner& d = diff_as.data()[ord];
+    BOOST_REQUIRE(bool(d));
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], a.data()[i] - S.at_ordinal(ord), 1e-12);
+  }
+  // scalar - ToT.
+  Outer diff_sa = S.subt(A);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = A.data()[ord];
+    const Inner& d = diff_sa.data()[ord];
+    BOOST_REQUIRE(bool(d));
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], S.at_ordinal(ord) - a.data()[i], 1e-12);
+  }
+}
+
+// `Tensor<ArenaTensor>` should support the same reductions as a flat tensor
+// (sum / product / squared_norm / min / max), routed through TA's ToT
+// reduce path via the `is_tensor_of_tensor_helper` extension.
+// Sanity-check the trait flip:
+//  - is_arena_tensor_v<ArenaTensor<...>> must be true
+//  - is_tensor_of_tensor_v<Tensor<ArenaTensor>> must be true (was false)
+//  - is_tensor_v<Tensor<ArenaTensor>> must be false (was true)
+static_assert(TA::is_arena_tensor_v<Inner>);
+static_assert(TA::detail::is_tensor_of_tensor_v<Outer>);
+static_assert(!TA::detail::is_tensor_v<Outer>);
+// And view-aware in-place ops must work for Tensor<ArenaTensor>.
+// Confirm prerequisite traits hold:
+static_assert(TA::is_tensor_view_v<Inner>, "ArenaTensor must be a tensor view");
+static_assert(TA::is_tensor_view_v<typename Outer::value_type>,
+              "Outer's value_type (ArenaTensor) must be a tensor view");
+
+// Spot-check that the legacy in-place ops which use `is_tensor<Right>`
+// SFINAE *do not* match for `Tensor<ArenaTensor>` after the trait flip.
+// If they did, instantiating them would fail (no operator-= on
+// ArenaTensor). Probe via `has_member_function_subt_to_anyreturn_v`.
+
+// Smoke test: in-place ops on Tensor<ArenaTensor> compile and execute.
+BOOST_AUTO_TEST_CASE(tot_inplace_ops_smoketest) {
+  Outer a = TA::detail::arena_outer_init<Outer>(
+      TA::Range{2}, 1, [](std::size_t) { return TA::Range{3}; });
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i) a.data()[ord].data()[i] = 1.0;
+  Outer b = TA::detail::arena_outer_init<Outer>(
+      TA::Range{2}, 1, [](std::size_t) { return TA::Range{3}; });
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i) b.data()[ord].data()[i] = 2.0;
+  a.add_to(b);  // expect 3.0 elements
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 3.0, 1e-12);
+  a.subt_to(b);  // back to 1.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 1.0, 1e-12);
+  a.mult_to(b);  // 2.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 2.0, 1e-12);
+  a.scale_to(0.5);  // 1.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 1.0, 1e-12);
+  a.neg_to();  // -1.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], -1.0, 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(tot_reductions_match_flat_aggregate) {
+  using Inner = TA::ArenaTensor<double, TA::Range>;
+  using Outer = TA::Tensor<Inner>;
+  Outer a = TA::detail::arena_outer_init<Outer>(
+      TA::Range{3}, 1, [](std::size_t /*ord*/) { return TA::Range{4}; });
+  double expected_sum = 0.0;
+  double expected_product = 1.0;
+  double expected_sq_norm = 0.0;
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    Inner& inner = a.data()[ord];
+    for (std::size_t i = 0; i < inner.size(); ++i) {
+      const double v = 1.0 + ord * 10.0 + i;  // deterministic, all positive
+      inner.data()[i] = v;
+      expected_sum += v;
+      expected_product *= v;
+      expected_sq_norm += v * v;
+    }
+  }
+  BOOST_CHECK_CLOSE(a.sum(), expected_sum, 1e-12);
+  BOOST_CHECK_CLOSE(a.product(), expected_product, 1e-12);
+  BOOST_CHECK_CLOSE(a.squared_norm(), expected_sq_norm, 1e-12);
+  BOOST_CHECK_CLOSE(a.norm(), std::sqrt(expected_sq_norm), 1e-12);
+}
+
+// axpy_to on Tensor<ArenaTensor>: verifies axpy semantics
+// (factor scales only the added operand, not the existing result) —
+// distinct from add_to(right, factor) which is `(result + right) * factor`.
+BOOST_AUTO_TEST_CASE(tot_axpy_to_accumulates_scaled_operand) {
+  Outer result = make_outer(3, 4, 10.0);
+  std::vector<std::vector<double>> initial(3, std::vector<double>(4));
+  for (std::size_t ord = 0; ord < 3; ++ord)
+    for (std::size_t i = 0; i < 4; ++i)
+      initial[ord][i] = result.data()[ord].data()[i];
+  Outer arg = make_outer(3, 4, 1.0);
+  const double factor = 0.5;
+  using TiledArray::axpy_to;
+  axpy_to(result, arg, factor);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = arg.data()[ord];
+    const Inner& d = result.data()[ord];
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], initial[ord][i] + a.data()[i] * factor,
+                        1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_tot_trivial.cpp b/tests/arena_tot_trivial.cpp
new file mode 100644
index 0000000000..627bd5a7cc
--- /dev/null
+++ b/tests/arena_tot_trivial.cpp
@@ -0,0 +1,144 @@
+/// Arena-aware ToT trivial-op end-to-end tests (add, subt, mult, scale, clone).
+
+#include "TiledArray/tensor.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace TA = TiledArray;
+using inner_t = TA::Tensor<double>;
+using outer_t = TA::Tensor<inner_t>;
+
+namespace {
+
+outer_t make_tot(std::size_t N_outer, std::size_t n_inner, double base = 1.0) {
+  outer_t outer(TA::Range{static_cast<long>(N_outer)}, 1);
+  for (std::size_t ord = 0; ord < N_outer; ++ord) {
+    inner_t inner(TA::Range{static_cast<long>(n_inner)});
+    for (std::size_t i = 0; i < n_inner; ++i)
+      inner.at_ordinal(i) = base + ord * 100.0 + i;
+    *(outer.data() + ord) = std::move(inner);
+  }
+  return outer;
+}
+
+bool tot_equal(const outer_t& a, const outer_t& b) {
+  if (a.range().volume() != b.range().volume()) return false;
+  for (std::size_t ord = 0; ord < a.range().volume(); ++ord) {
+    const inner_t& ai = *(a.data() + ord);
+    const inner_t& bi = *(b.data() + ord);
+    if (ai.range().volume() != bi.range().volume()) return false;
+    for (std::size_t i = 0; i < ai.range().volume(); ++i)
+      if (ai.at_ordinal(i) != bi.at_ordinal(i)) return false;
+  }
+  return true;
+}
+
+/// All inner cells point into one contiguous slab (monotonic with bounded gap).
+bool inners_share_one_slab(const outer_t& tot) {
+  if (tot.range().volume() == 0) return true;
+  const double* prev_end = nullptr;
+  for (std::size_t ord = 0; ord < tot.range().volume(); ++ord) {
+    const inner_t& cell = *(tot.data() + ord);
+    if (cell.range().volume() == 0) continue;
+    const double* cell_begin = cell.data();
+    const double* cell_end = cell_begin + cell.range().volume();
+    if (prev_end != nullptr && cell_begin < prev_end) return false;
+    if (prev_end != nullptr &&
+        static_cast<std::size_t>(cell_begin - prev_end) > 1024)
+      return false;
+    prev_end = cell_end;
+  }
+  return true;
+}
+
+}
+
+BOOST_AUTO_TEST_SUITE(arena_tot_trivial_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(scale_bit_equal_and_one_slab) {
+  outer_t src = make_tot(6, 8, 1.0);
+  outer_t arena_result = src.scale(2.5);
+  outer_t baseline(src.range(), 1);
+  for (std::size_t ord = 0; ord < src.range().volume(); ++ord) {
+    inner_t inner((src.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (src.data() + ord)->at_ordinal(i) * 2.5;
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(clone_bit_equal_and_one_slab) {
+  outer_t src = make_tot(6, 8, 3.0);
+  outer_t arena_result = src.clone();
+  BOOST_CHECK(tot_equal(arena_result, src));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(add_bit_equal_and_one_slab) {
+  outer_t L = make_tot(6, 8, 1.0);
+  outer_t R = make_tot(6, 8, 0.5);
+  outer_t arena_result = L.add(R);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (L.data() + ord)->at_ordinal(i) +
+                            (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(subt_bit_equal_and_one_slab) {
+  outer_t L = make_tot(6, 8, 5.0);
+  outer_t R = make_tot(6, 8, 1.0);
+  outer_t arena_result = L.subt(R);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (L.data() + ord)->at_ordinal(i) -
+                            (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(mult_elementwise_bit_equal_and_one_slab) {
+  outer_t L = make_tot(6, 8, 2.0);
+  outer_t R = make_tot(6, 8, 0.5);
+  outer_t arena_result = L.mult(R);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (L.data() + ord)->at_ordinal(i) *
+                            (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(arena_outlives_source) {
+  outer_t arena_result;
+  {
+    outer_t src = make_tot(3, 4, 9.0);
+    arena_result = src.scale(2.0);
+  }
+  for (std::size_t ord = 0; ord < arena_result.range().volume(); ++ord)
+    for (std::size_t i = 0; i < (arena_result.data() + ord)->range().volume();
+         ++i)
+      BOOST_CHECK_EQUAL((arena_result.data() + ord)->at_ordinal(i),
+                        (9.0 + ord * 100.0 + i) * 2.0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/cases/CMakeLists.txt b/tests/cases/CMakeLists.txt
new file mode 100644
index 0000000000..8cc4721163
--- /dev/null
+++ b/tests/cases/CMakeLists.txt
@@ -0,0 +1,15 @@
+# hec_* + 4d_e per-cell case binaries (arena vs heap).
+
+set(_cases
+  case_hec_h
+  case_hec_e
+  case_hec_ec
+  case_hec_scale
+  case_4d_e
+)
+
+foreach(_case ${_cases})
+  add_ta_executable(${_case} "${_case}.cpp" "tiledarray")
+  target_include_directories(${_case} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+  add_dependencies(test-cases-tiledarray ${_case})
+endforeach()
diff --git a/tests/cases/case_4d_e.cpp b/tests/cases/case_4d_e.cpp
new file mode 100644
index 0000000000..df1f47b6ea
--- /dev/null
+++ b/tests/cases/case_4d_e.cpp
@@ -0,0 +1,61 @@
+/// 4d_e: outer-4D x outer-3D with one Hadamard, one contracted, three free.
+
+#include "case_common.h"
+
+#include <cmath>
+
+namespace c = cases;
+
+namespace {
+
+/// Deterministic truncated-exponential inner-size, mean ~10, cap 50.
+inline long a_size(long p, long q) {
+  unsigned long h =
+      (static_cast<unsigned long>(p) * 73ULL +
+       static_cast<unsigned long>(q) * 113ULL + 17ULL) * 2654435761ULL;
+  double u = static_cast<double>(h & 0x7FFFFFFFUL) /
+             static_cast<double>(0x80000000UL);
+  double x = -10.0 * std::log(1.0 - u);
+  if (x > 50.0) x = 50.0;
+  return static_cast<long>(x);
+}
+
+}  // namespace
+
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  constexpr int I = 20;
+  constexpr int M = 50;
+  constexpr int K = 100;
+
+  auto sl = [](long q, long p, long /*m*/, long /*k*/) {
+    return TiledArray::Range{a_size(p, q)};
+  };
+  auto sr = [](long r, long q, long /*m*/) {
+    return TiledArray::Range{a_size(q, r)};
+  };
+
+  return c::run_case_main_split(
+      argc, argv, "4d_e",
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_4d_jagged(w, I, I, M, K, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged(w, I, I, M, 100.0, sr);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_4d_jagged_slab(w, I, I, M, K, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, I, I, M, 100.0, sr);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("q,p,m,k;s"),
+                                   ops.rhs("r,q,m;t"),
+                                   "p,r,q,k;s,t");
+      });
+}
diff --git a/tests/cases/case_common.h b/tests/cases/case_common.h
new file mode 100644
index 0000000000..b097eb56cd
--- /dev/null
+++ b/tests/cases/case_common.h
@@ -0,0 +1,528 @@
+/// Shared bench helpers for arena-vs-heap case binaries.
+
+#pragma once
+
+#include <tiledarray.h>
+#include <TiledArray/einsum/tiledarray.h>
+#include <TiledArray/tensor/arena.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <memory_resource>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace cases {
+
+namespace TA = ::TiledArray;
+
+using inner_t = TA::Tensor<double>;
+using tile_t = TA::Tensor<inner_t>;
+using ToT = TA::DistArray<tile_t, TA::DensePolicy>;
+using Plain = TA::DistArray<inner_t, TA::DensePolicy>;
+
+inline int& g_tile_grid() {
+  static int v = 7;
+  return v;
+}
+
+/// Stores the h-dimension scale set by --h-scale.
+inline int& g_h_scale() {
+  static int v = 1;
+  return v;
+}
+
+inline std::vector<std::size_t> tile_breaks(int n, int ntiles) {
+  if (ntiles <= 1 || n <= 0)
+    return {0, static_cast<std::size_t>(std::max(n, 0))};
+  std::vector<std::size_t> b;
+  b.reserve(ntiles + 1);
+  const int chunk = n / ntiles;
+  for (int t = 0; t < ntiles; ++t) {
+    b.push_back(static_cast<std::size_t>(t * chunk));
+  }
+  b.push_back(static_cast<std::size_t>(n));
+  std::vector<std::size_t> uniq;
+  for (auto x : b)
+    if (uniq.empty() || uniq.back() != x) uniq.push_back(x);
+  return uniq;
+}
+
+inline TA::TiledRange1 tr1_dim(int n) {
+  auto b = tile_breaks(n, g_tile_grid());
+  return TA::TiledRange1(b.begin(), b.end());
+}
+
+inline TA::TiledRange tr3(int a, int b, int c) {
+  return TA::TiledRange{tr1_dim(a), tr1_dim(b), tr1_dim(c)};
+}
+
+inline TA::TiledRange tr4(int a, int b, int c, int d) {
+  return TA::TiledRange{tr1_dim(a), tr1_dim(b), tr1_dim(c), tr1_dim(d)};
+}
+
+/// Builds a 3-D slab-backed jagged ToT.
+template <typename Fn>
+ToT make_tot_3d_jagged_slab(TA::World& world, int A, int B, int C,
+                            double offset, Fn inner_fn) {
+  ToT out(world, tr3(A, B, C));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    const std::size_t n_cells = tile_range.volume();
+    std::vector<TA::Range> ranges;
+    ranges.reserve(n_cells);
+    std::vector<std::size_t> cell_offsets(n_cells);
+    std::size_t total_elems = 0;
+    {
+      std::size_t ord = 0;
+      for (auto outer_idx : tile_range) {
+        const long o0 = static_cast<long>(outer_idx[0]);
+        const long o1 = static_cast<long>(outer_idx[1]);
+        const long o2 = static_cast<long>(outer_idx[2]);
+        TA::Range ir = inner_fn(o0, o1, o2);
+        cell_offsets[ord] = total_elems;
+        const std::size_t vol = ir.volume();
+        const std::size_t padded = (vol + 7) & ~std::size_t{7};
+        total_elems += padded;
+        ranges.push_back(std::move(ir));
+        ++ord;
+      }
+    }
+
+    std::shared_ptr<double[]> slab;
+    if (total_elems > 0) {
+      void* raw = nullptr;
+      if (posix_memalign(&raw, 64, total_elems * sizeof(double)) != 0) {
+        std::abort();
+      }
+      slab = std::shared_ptr<double[]>(static_cast<double*>(raw),
+                                       [](double* p) { std::free(p); });
+    }
+
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast<long>(outer_idx[0]);
+      const long o1 = static_cast<long>(outer_idx[1]);
+      const long o2 = static_cast<long>(outer_idx[2]);
+      auto& ir = ranges[ord];
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        std::shared_ptr<double[]> alias(slab,
+                                        slab.get() + cell_offsets[ord]);
+        for (std::size_t k = 0; k < vol; ++k)
+          alias[k] = offset + 1e-4 * static_cast<double>(
+                                          o0 * 100000 + o1 * 1000 + o2 * 100 + k);
+        *(tile.data() + ord) = inner_t(ir, std::move(alias));
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 3-D heap-scattered jagged ToT.
+template <typename Fn>
+ToT make_tot_3d_jagged(TA::World& world, int A, int B, int C, double offset,
+                       Fn inner_fn) {
+  ToT out(world, tr3(A, B, C));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast<long>(outer_idx[0]);
+      const long o1 = static_cast<long>(outer_idx[1]);
+      const long o2 = static_cast<long>(outer_idx[2]);
+      TA::Range ir = inner_fn(o0, o1, o2);
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        inner_t inner(ir);
+        for (std::size_t k = 0; k < vol; ++k)
+          inner.at_ordinal(k) =
+              offset + 1e-4 * static_cast<double>(
+                                  o0 * 100000 + o1 * 1000 + o2 * 100 + k);
+        *(tile.data() + ord) = std::move(inner);
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 4-D slab-backed jagged ToT.
+template <typename Fn>
+ToT make_tot_4d_jagged_slab(TA::World& world, int A, int B, int C, int D,
+                            double offset, Fn inner_fn) {
+  ToT out(world, tr4(A, B, C, D));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    const std::size_t n_cells = tile_range.volume();
+    std::vector<TA::Range> ranges;
+    ranges.reserve(n_cells);
+    std::vector<std::size_t> cell_offsets(n_cells);
+    std::size_t total_elems = 0;
+    {
+      std::size_t ord = 0;
+      for (auto outer_idx : tile_range) {
+        const long o0 = static_cast<long>(outer_idx[0]);
+        const long o1 = static_cast<long>(outer_idx[1]);
+        const long o2 = static_cast<long>(outer_idx[2]);
+        const long o3 = static_cast<long>(outer_idx[3]);
+        TA::Range ir = inner_fn(o0, o1, o2, o3);
+        cell_offsets[ord] = total_elems;
+        const std::size_t vol = ir.volume();
+        const std::size_t padded = (vol + 7) & ~std::size_t{7};
+        total_elems += padded;
+        ranges.push_back(std::move(ir));
+        ++ord;
+      }
+    }
+    std::shared_ptr<double[]> slab;
+    if (total_elems > 0) {
+      void* raw = nullptr;
+      if (posix_memalign(&raw, 64, total_elems * sizeof(double)) != 0) {
+        std::abort();
+      }
+      slab = std::shared_ptr<double[]>(static_cast<double*>(raw),
+                                       [](double* p) { std::free(p); });
+    }
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast<long>(outer_idx[0]);
+      const long o1 = static_cast<long>(outer_idx[1]);
+      const long o2 = static_cast<long>(outer_idx[2]);
+      const long o3 = static_cast<long>(outer_idx[3]);
+      auto& ir = ranges[ord];
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        std::shared_ptr<double[]> alias(slab,
+                                        slab.get() + cell_offsets[ord]);
+        for (std::size_t k = 0; k < vol; ++k)
+          alias[k] = offset + 1e-4 * static_cast<double>(
+                                          o0 * 1000000 + o1 * 10000 +
+                                          o2 * 100 + o3 * 10 + k);
+        *(tile.data() + ord) = inner_t(ir, std::move(alias));
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 4-D heap-scattered jagged ToT.
+template <typename Fn>
+ToT make_tot_4d_jagged(TA::World& world, int A, int B, int C, int D,
+                       double offset, Fn inner_fn) {
+  ToT out(world, tr4(A, B, C, D));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast<long>(outer_idx[0]);
+      const long o1 = static_cast<long>(outer_idx[1]);
+      const long o2 = static_cast<long>(outer_idx[2]);
+      const long o3 = static_cast<long>(outer_idx[3]);
+      TA::Range ir = inner_fn(o0, o1, o2, o3);
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        inner_t inner(ir);
+        for (std::size_t k = 0; k < vol; ++k)
+          inner.at_ordinal(k) =
+              offset + 1e-4 * static_cast<double>(
+                                  o0 * 1000000 + o1 * 10000 + o2 * 100 +
+                                  o3 * 10 + k);
+        *(tile.data() + ord) = std::move(inner);
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+inline Plain make_plain_3d(TA::World& world, int A, int B, int C,
+                           double offset) {
+  Plain out(world, tr3(A, B, C));
+  out.init_tiles([offset](const TA::Range& r) {
+    inner_t tile(r);
+    for (std::size_t k = 0; k < r.volume(); ++k)
+      tile.at_ordinal(k) = offset + 1e-3 * static_cast<double>(k);
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+inline double max_abs_diff(const ToT& a, const ToT& b) {
+  if (a.trange() != b.trange()) return 1e30;
+  double mx = 0.0;
+  const auto& tr = a.trange();
+  for (auto t = tr.tiles_range().begin(); t != tr.tiles_range().end(); ++t) {
+    if (!a.is_local(*t)) continue;
+    auto ta = a.find(*t).get();
+    auto tb = b.find(*t).get();
+    if (ta.range().volume() != tb.range().volume()) return 1e30;
+    for (std::size_t ord = 0; ord < ta.range().volume(); ++ord) {
+      const auto& ia = *(ta.data() + ord);
+      const auto& ib = *(tb.data() + ord);
+      if (ia.range().volume() != ib.range().volume()) {
+        if (ia.range().volume() == 0 || ib.range().volume() == 0) {
+          mx = std::max(mx, 1.0);
+          continue;
+        }
+        return 1e30;
+      }
+      for (std::size_t k = 0; k < ia.range().volume(); ++k) {
+        double d = std::abs(ia.at_ordinal(k) - ib.at_ordinal(k));
+        if (d > mx) mx = d;
+      }
+    }
+  }
+  return mx;
+}
+
+struct RunResult {
+  double wall_ns_min = 0.0;
+  double wall_ns_med = 0.0;
+  ToT result;
+  bool ok = true;
+  std::string err;
+};
+
+template <typename Runner>
+RunResult time_run(TA::World& world, Runner&& run, bool disable_arena,
+                   int repeats) {
+  RunResult R;
+  std::vector<double> ns;
+  ns.reserve(repeats);
+  for (int r = 0; r < repeats; ++r) {
+    TA::detail::arena_disabled() = disable_arena;
+    world.gop.fence();
+    auto t0 = std::chrono::steady_clock::now();
+    try {
+      R.result = run();
+      world.gop.fence();
+    } catch (std::exception& e) {
+      R.ok = false;
+      R.err = e.what();
+      return R;
+    } catch (...) {
+      R.ok = false;
+      R.err = "unknown";
+      return R;
+    }
+    auto t1 = std::chrono::steady_clock::now();
+    ns.push_back(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count());
+  }
+  std::sort(ns.begin(), ns.end());
+  R.wall_ns_min = ns.front();
+  R.wall_ns_med = ns[ns.size() / 2];
+  return R;
+}
+
+/// Runs a case binary by building operands once and timing one mode.
+template <typename Build, typename Run>
+int run_case_main(int argc, char** argv, const char* case_name, Build build,
+                  Run run) {
+  // Heap and arena timings must run in separate processes to avoid allocator/cache bias.
+  std::string mode;
+  int repeats = 3;
+  bool quiet = false;
+  int tile_grid = 7;
+  for (int i = 1; i < argc; ++i) {
+    std::string a = argv[i];
+    if (a == "--mode" && i + 1 < argc) {
+      mode = argv[++i];
+    } else if (a == "--repeat" && i + 1 < argc) {
+      repeats = std::atoi(argv[++i]);
+    } else if (a == "--tile-grid" && i + 1 < argc) {
+      tile_grid = std::max(1, std::atoi(argv[++i]));
+    } else if (a == "--h-scale" && i + 1 < argc) {
+      g_h_scale() = std::max(1, std::atoi(argv[++i]));
+    } else if (a == "--quiet") {
+      quiet = true;
+    } else if (a == "-h" || a == "--help") {
+      std::cout
+          << "Usage: " << argv[0]
+          << " --mode {heap|arena} [--tile-grid G] [--h-scale S] "
+             "[--repeat R] [--quiet]\n"
+             "MAD_NUM_THREADS env var controls thread count.\n"
+             "Note: --mode is required. heap and arena MUST be benchmarked\n"
+             "in separate processes — running both in one process biases the\n"
+             "second run via allocator fragmentation and cache residue.\n";
+      return 0;
+    }
+  }
+  if (mode != "heap" && mode != "arena") {
+    std::cerr << "error: --mode must be 'heap' or 'arena' (got '"
+              << mode << "')\n";
+    return 2;
+  }
+  g_tile_grid() = tile_grid;
+
+  TA::World& world = TA::initialize(argc, argv);
+
+  const char* threads_env = std::getenv("MAD_NUM_THREADS");
+  std::string threads_label = threads_env ? threads_env : "default";
+
+  std::cout << "case,mode,tile_grid,threads,wall_ns_min,wall_ns_med,verified\n";
+
+  if (!quiet) {
+    std::cerr << "# " << case_name << " tile_grid=" << tile_grid
+              << " h_scale=" << g_h_scale()
+              << " threads=" << threads_label << "\n";
+  }
+
+  auto operands = build(world);
+
+  auto emit = [&](const char* m, const RunResult& R, const std::string& v) {
+    if (!R.ok) {
+      std::cout << case_name << "," << m << "," << tile_grid << ","
+                << threads_label << ",NA,NA,err:" << R.err << "\n";
+      return;
+    }
+    std::cout << case_name << "," << m << "," << tile_grid << ","
+              << threads_label << "," << static_cast<long long>(R.wall_ns_min)
+              << "," << static_cast<long long>(R.wall_ns_med) << "," << v
+              << "\n";
+  };
+
+  if (mode == "heap") {
+    auto Rh = time_run(
+        world, [&]() { return run(operands); }, true,
+        repeats);
+    emit("heap", Rh, "single");
+    if (!quiet) {
+      std::cerr << "  heap=" << Rh.wall_ns_med / 1e6 << "ms\n";
+    }
+  } else {
+    auto Ra = time_run(
+        world, [&]() { return run(operands); }, false,
+        repeats);
+    emit("arena", Ra, "single");
+    if (!quiet) {
+      std::cerr << "  arena=" << Ra.wall_ns_med / 1e6 << "ms\n";
+    }
+  }
+
+  std::cout.flush();
+  TA::detail::arena_disabled() = false;
+  TA::finalize();
+  return 0;
+}
+
+/// Runs a case binary with separate heap-scatter and arena-slab input builders.
+template <typename BuildHeap, typename BuildArena, typename Run>
+int run_case_main_split(int argc, char** argv, const char* case_name,
+                        BuildHeap build_heap, BuildArena build_arena,
+                        Run run) {
+  // Heap and arena timings must run in separate processes to avoid allocator/cache bias.
+  std::string mode;
+  int repeats = 3;
+  bool quiet = false;
+  int tile_grid = 7;
+  for (int i = 1; i < argc; ++i) {
+    std::string a = argv[i];
+    if (a == "--mode" && i + 1 < argc) {
+      mode = argv[++i];
+    } else if (a == "--repeat" && i + 1 < argc) {
+      repeats = std::atoi(argv[++i]);
+    } else if (a == "--tile-grid" && i + 1 < argc) {
+      tile_grid = std::max(1, std::atoi(argv[++i]));
+    } else if (a == "--h-scale" && i + 1 < argc) {
+      g_h_scale() = std::max(1, std::atoi(argv[++i]));
+    } else if (a == "--quiet") {
+      quiet = true;
+    } else if (a == "-h" || a == "--help") {
+      std::cout << "Usage: " << argv[0]
+                << " --mode {heap|arena} [--tile-grid G] [--h-scale S] "
+                   "[--repeat R] [--quiet]\n"
+                   "Heap mode uses scattered (legacy) inputs; arena mode "
+                   "uses slab-backed inputs.\n"
+                   "Note: --mode is required. heap and arena MUST be "
+                   "benchmarked in separate\n"
+                   "processes — running both in one process biases the "
+                   "second run via allocator\n"
+                   "fragmentation and cache residue.\n";
+      return 0;
+    }
+  }
+  if (mode != "heap" && mode != "arena") {
+    std::cerr << "error: --mode must be 'heap' or 'arena' (got '"
+              << mode << "')\n";
+    return 2;
+  }
+  g_tile_grid() = tile_grid;
+
+  TA::World& world = TA::initialize(argc, argv);
+
+  const char* threads_env = std::getenv("MAD_NUM_THREADS");
+  std::string threads_label = threads_env ? threads_env : "default";
+
+  std::cout << "case,mode,tile_grid,threads,wall_ns_min,wall_ns_med,verified\n";
+
+  if (!quiet) {
+    std::cerr << "# " << case_name << " tile_grid=" << tile_grid
+              << " h_scale=" << g_h_scale()
+              << " threads=" << threads_label
+              << " (split inputs: heap=scatter, arena=slab)\n";
+  }
+
+  auto emit = [&](const char* m, const RunResult& R, const std::string& v) {
+    if (!R.ok) {
+      std::cout << case_name << "," << m << "," << tile_grid << ","
+                << threads_label << ",NA,NA,err:" << R.err << "\n";
+      return;
+    }
+    std::cout << case_name << "," << m << "," << tile_grid << ","
+              << threads_label << "," << static_cast<long long>(R.wall_ns_min)
+              << "," << static_cast<long long>(R.wall_ns_med) << "," << v
+              << "\n";
+  };
+
+  if (mode == "heap") {
+    auto operands = build_heap(world);
+    auto Rh = time_run(
+        world, [&]() { return run(operands); }, true,
+        repeats);
+    emit("heap", Rh, "single");
+    if (!quiet) std::cerr << "  heap=" << Rh.wall_ns_med / 1e6 << "ms\n";
+  } else {
+    auto operands = build_arena(world);
+    auto Ra = time_run(
+        world, [&]() { return run(operands); }, false,
+        repeats);
+    emit("arena", Ra, "single");
+    if (!quiet) std::cerr << "  arena=" << Ra.wall_ns_med / 1e6 << "ms\n";
+  }
+
+  std::cout.flush();
+  TA::detail::arena_disabled() = false;
+  TA::finalize();
+  return 0;
+}
+
+}
diff --git a/tests/cases/case_hec_e.cpp b/tests/cases/case_hec_e.cpp
new file mode 100644
index 0000000000..83e767e4f2
--- /dev/null
+++ b/tests/cases/case_hec_e.cpp
@@ -0,0 +1,40 @@
+/// hec_e: A(h,i,j;m) * B(h,j,k;n) -> C(h,i,k;m,n); inner outer-product (i, k).
+
+#include "case_common.h"
+
+namespace c = cases;
+
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  constexpr int N = 30;
+  auto sl = [](long /*h*/, long i, long /*j*/) {
+    return TiledArray::Range{i};
+  };
+  auto sr = [](long /*h*/, long /*j*/, long k) {
+    return TiledArray::Range{k};
+  };
+  return c::run_case_main_split(
+      argc, argv, "hec_e",
+      [&](TiledArray::World& w) {
+        const int H = N * c::g_h_scale();
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged(w, H, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged(w, H, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        const int H = N * c::g_h_scale();
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged_slab(w, H, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, H, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("h,i,j;m"), ops.rhs("h,j,k;n"),
+                                  "h,i,k;m,n");
+      });
+}
diff --git a/tests/cases/case_hec_ec.cpp b/tests/cases/case_hec_ec.cpp
new file mode 100644
index 0000000000..857c8fcba8
--- /dev/null
+++ b/tests/cases/case_hec_ec.cpp
@@ -0,0 +1,38 @@
+/// hec_ec: A(h,i,j;m,p) * B(h,j,k;p,n) -> C(h,i,k;m,n); inner contracts p.
+
+#include "case_common.h"
+
+namespace c = cases;
+
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  constexpr int N = 60;
+  auto sl = [](long /*h*/, long i, long j) {
+    return TiledArray::Range{i, j};
+  };
+  auto sr = [](long /*h*/, long j, long k) {
+    return TiledArray::Range{j, k};
+  };
+  return c::run_case_main_split(
+      argc, argv, "hec_ec",
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged(w, N, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged(w, N, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged_slab(w, N, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, N, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("h,i,j;m,p"), ops.rhs("h,j,k;p,n"),
+                                  "h,i,k;m,n");
+      });
+}
diff --git a/tests/cases/case_hec_h.cpp b/tests/cases/case_hec_h.cpp
new file mode 100644
index 0000000000..8b4da12071
--- /dev/null
+++ b/tests/cases/case_hec_h.cpp
@@ -0,0 +1,35 @@
+/// hec_h: A(h,i,j;m,n) * B(h,j,k;m,n) -> C(h,i,k;m,n); inner = (h, h).
+
+#include "case_common.h"
+
+namespace c = cases;
+
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  constexpr int N = 56;
+  auto sf = [](long h, long /*o1*/, long /*o2*/) {
+    return TiledArray::Range{h, h};
+  };
+  return c::run_case_main_split(
+      argc, argv, "hec_h",
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged(w, N, N, N, /*offset=*/1.0, sf);
+        ops.rhs = c::make_tot_3d_jagged(w, N, N, N, /*offset=*/100.0, sf);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged_slab(w, N, N, N, /*offset=*/1.0, sf);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, N, N, N, /*offset=*/100.0, sf);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("h,i,j;m,n"), ops.rhs("h,j,k;m,n"),
+                                  "h,i,k;m,n");
+      });
+}
diff --git a/tests/cases/case_hec_scale.cpp b/tests/cases/case_hec_scale.cpp
new file mode 100644
index 0000000000..34d399e323
--- /dev/null
+++ b/tests/cases/case_hec_scale.cpp
@@ -0,0 +1,35 @@
+/// hec_scale: A(h,i,j;m,n) * B_plain(h,j,k) -> C(h,i,k;m,n); inner scale.
+
+#include "case_common.h"
+
+namespace c = cases;
+
+struct Ops {
+  c::ToT lhs;
+  c::Plain rhs;
+};
+
+int main(int argc, char** argv) {
+  constexpr int N = 56;
+  auto sl = [](long /*h*/, long i, long /*j*/) {
+    return TiledArray::Range{i, i};
+  };
+  return c::run_case_main_split(
+      argc, argv, "hec_scale",
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged(w, N, N, N, 1.0, sl);
+        ops.rhs = c::make_plain_3d(w, N, N, N, 0.5);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged_slab(w, N, N, N, 1.0, sl);
+        ops.rhs = c::make_plain_3d(w, N, N, N, 0.5);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("h,i,j;m,n"), ops.rhs("h,j,k"),
+                                  "h,i,k;m,n");
+      });
+}
diff --git a/tests/einsum.cpp b/tests/einsum.cpp
index 6d32285de2..8bae61cf1f 100644
--- a/tests/einsum.cpp
+++ b/tests/einsum.cpp
@@ -237,6 +237,22 @@ BOOST_AUTO_TEST_CASE(equal_nested_ranks) {
                                             {{0, 2}, {0, 3}, {0, 2}},  //
                                             {3},                       //
                                             {2}));
+
+  // H+C;C with permuted inner operands -- no outer permutation, so this
+  // exercises the regime-A arena path; manual_eval is the independent oracle.
+  BOOST_REQUIRE(check_manual_eval<ArrayToT>("ijk;om,ijk;on->ij;nm",    //
+                                            {{0, 2}, {0, 3}, {0, 2}},  //
+                                            {{0, 2}, {0, 3}, {0, 2}},  //
+                                            {3, 2},                    //
+                                            {3, 2}));
+
+  // H+C;H with a permuted inner Hadamard operand -- no outer permutation, so
+  // this exercises the regime-A arena Hadamard path against manual_eval.
+  BOOST_REQUIRE(check_manual_eval<ArrayToT>("ijk;mn,ijk;nm->ij;mn",    //
+                                            {{0, 2}, {0, 3}, {0, 2}},  //
+                                            {{0, 2}, {0, 3}, {0, 2}},  //
+                                            {4, 3},                    //
+                                            {3, 4}));
   // H+C;H+C not supported
 
   // H;C(op)
diff --git a/tests/tot_construction.cpp b/tests/tot_construction.cpp
new file mode 100644
index 0000000000..bfa3a9ee75
--- /dev/null
+++ b/tests/tot_construction.cpp
@@ -0,0 +1,728 @@
+/// Unified tensor-of-tensors construction: detail::make_nested_tile,
+/// DistArray::init_tiles_nested, and the DistArray ToT range_fn constructor --
+/// exercised identically for TA::Tensor and ArenaTensor inner tiles.
+
+#include "TiledArray/einsum/tiledarray.h"
+#include "TiledArray/tensor/arena_kernels.h"
+#include "TiledArray/tensor/arena_tensor.h"
+#include "tiledarray.h"
+
+#include "global_fixture.h"
+#include "unit_test_config.h"
+
+#include <cstddef>
+#include <vector>
+
+namespace {
+
+namespace TA = TiledArray;
+
+/// Deliberately non-uniform inner extent keyed on the outer element index.
+inline long inner_extent(long e) { return 2 + (e % 3); }
+
+/// Build a rank-1 inner range of the inner tile's range type.
+template <typename InnerTile, typename Index>
+auto inner_range_for(const Index& idx) {
+  return
+      typename InnerTile::range_type{inner_extent(static_cast<long>(idx[0]))};
+}
+
+/// Build a rank-2 (d0 x d1) inner range of the inner tile's range type.
+/// Works for both TA::Range (TA::Tensor inner) and btas::zb::RangeNd
+/// (ArenaTensor inner), which are both constructible from an extent vector.
+template <typename InnerTile>
+auto inner_range_2d(std::size_t d0, std::size_t d1) {
+  return typename InnerTile::range_type(std::vector<std::size_t>{d0, d1});
+}
+
+template <typename InnerTile>
+void verify_cell(const InnerTile& cell, long e, bool expect_filled) {
+  BOOST_REQUIRE(!cell.empty());
+  BOOST_CHECK_EQUAL(static_cast<long>(cell.size()), inner_extent(e));
+  for (std::size_t i = 0; i < cell.size(); ++i) {
+    const double expect = expect_filled ? (100.0 * e + i) : 0.0;
+    BOOST_CHECK_EQUAL(cell.data()[i], expect);
+  }
+}
+
+/// Fill an inner cell so element i of outer element e holds 100*e + i.
+template <typename Cell, typename Index>
+void fill_cell(Cell& cell, const Index& idx) {
+  const long e = static_cast<long>(idx[0]);
+  for (std::size_t i = 0; i < cell.size(); ++i) cell.data()[i] = 100.0 * e + i;
+}
+
+template <typename InnerTile>
+void test_make_nested_tile() {
+  using OuterTile = TA::Tensor<InnerTile>;
+  const TA::Range outer{4};
+  OuterTile tile = TA::detail::make_nested_tile<OuterTile>(
+      outer, [](const auto& idx) { return inner_range_for<InnerTile>(idx); },
+      [](auto& cell, const auto& idx) { fill_cell(cell, idx); });
+  BOOST_REQUIRE_EQUAL(tile.range().volume(), 4u);
+  for (long e = 0; e < 4; ++e)
+    verify_cell(tile.data()[e], e, /*expect_filled=*/true);
+}
+
+template <typename InnerTile, typename Policy>
+void test_dist_array_tot_ctor() {
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  // ToT range_fn ctor: shapes every inner cell, storage zero-initialized.
+  Array a(world, trange,
+          [](const auto& idx) { return inner_range_for<InnerTile>(idx); });
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      verify_cell(tile.data()[ord], e, /*expect_filled=*/false);
+    }
+  }
+}
+
+template <typename InnerTile, typename Policy>
+void test_init_tiles_nested() {
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  Array a(world, trange);
+  a.init_tiles_nested(
+      [](const auto& idx) { return inner_range_for<InnerTile>(idx); },
+      [](auto& cell, const auto& idx) { fill_cell(cell, idx); });
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      verify_cell(tile.data()[ord], e, /*expect_filled=*/true);
+    }
+  }
+}
+
+/// fill_random on an already-shaped ToT array is an in-place scalar mutator:
+/// it overwrites every inner scalar while leaving the inner ranges intact.
+template <typename InnerTile, typename Policy>
+void test_fill_random() {
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  Array a(world, trange,
+          [](const auto& idx) { return inner_range_for<InnerTile>(idx); });
+  a.fill_random();
+  double sum = 0.0;
+  std::size_t ncells = 0;
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      const auto& cell = tile.data()[ord];
+      // inner ranges must survive the in-place fill
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_CHECK_EQUAL(static_cast<long>(cell.size()), inner_extent(e));
+      for (std::size_t i = 0; i < cell.size(); ++i) sum += cell.data()[i];
+      ++ncells;
+    }
+  }
+  BOOST_REQUIRE_GT(ncells, 0u);
+  // a random fill leaving every scalar exactly 0 is a measure-zero event
+  BOOST_CHECK_NE(sum, 0.0);
+}
+
+/// init_elements drives the ToT constructor with an op that yields freestanding
+/// owning inner tensors; for arena inners each outer tile collects the op
+/// outputs, sizes one slab to fit, and deep-copies into the bound cells.
+template <typename InnerTile, typename Policy>
+void test_init_elements() {
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  Array a(world, trange);
+  a.init_elements([](const auto& idx) {
+    const long e = static_cast<long>(idx[0]);
+    TA::Tensor<double> t{TA::Range(inner_extent(e))};
+    for (std::size_t i = 0; i < t.size(); ++i) t.data()[i] = 100.0 * e + i;
+    return t;
+  });
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      verify_cell(tile.data()[ord], e, /*expect_filled=*/true);
+    }
+  }
+}
+
+/// fill on an already-shaped (uniform-extent) arena ToT deep-copies a
+/// freestanding owning tensor into every bound inner cell.
+void test_fill_arena() {
+  using InnerTile = TA::ArenaTensor<double>;
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, TA::DensePolicy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  const long ext = 3;
+  Array a(world, trange,
+          [ext](const auto&) { return typename InnerTile::range_type{ext}; });
+  TA::Tensor<double> value{TA::Range(ext)};
+  for (std::size_t i = 0; i < value.size(); ++i) value.data()[i] = 7.0 + i;
+  a.fill(value);
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const auto& cell = tile.data()[ord];
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_CHECK_EQUAL(static_cast<long>(cell.size()), ext);
+      for (std::size_t i = 0; i < cell.size(); ++i)
+        BOOST_CHECK_EQUAL(cell.data()[i], 7.0 + i);
+    }
+  }
+}
+
+/// set(i, value) populates a tile of an unshaped arena ToT array: every inner
+/// cell is sized to `value`'s range and deep-copies its data.
+void test_set_value_arena() {
+  using InnerTile = TA::ArenaTensor<double>;
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, TA::DensePolicy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  const long ext = 3;
+  // harvest a populated inner cell from a shaped source array
+  Array src(world, trange,
+            [ext](const auto&) { return typename InnerTile::range_type{ext}; });
+  InnerTile value;
+  for (const auto& tidx : src.trange().tiles_range()) {
+    if (!src.is_local(tidx)) continue;
+    auto tile = src.find(tidx).get();
+    value = tile.data()[0];  // null -> rebind: view src's cell 0
+    for (std::size_t i = 0; i < value.size(); ++i) value.data()[i] = 10.0 + i;
+    break;
+  }
+  BOOST_REQUIRE(!value.empty());
+  // populate a fresh, unshaped array tile-by-tile
+  Array a(world, trange);
+  for (const auto& tidx : a.trange().tiles_range())
+    if (a.is_local(tidx)) a.set(tidx, value);
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const auto& cell = tile.data()[ord];
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_CHECK_EQUAL(static_cast<long>(cell.size()), ext);
+      for (std::size_t i = 0; i < cell.size(); ++i)
+        BOOST_CHECK_EQUAL(cell.data()[i], 10.0 + i);
+    }
+  }
+}
+
+/// set(i, InIter) populates a tile from a sequence of freestanding owning
+/// inner tensors; the slab is sized from their (possibly non-uniform) ranges.
+void test_set_iter_arena() {
+  using InnerTile = TA::ArenaTensor<double>;
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, TA::DensePolicy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  Array a(world, trange);
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    const auto tr = a.trange().make_tile_range(tidx);
+    std::vector<TA::Tensor<double>> cells;
+    for (std::size_t ord = 0; ord < tr.volume(); ++ord) {
+      const long e = static_cast<long>(tr.idx(ord)[0]);
+      TA::Tensor<double> c{TA::Range(inner_extent(e))};
+      for (std::size_t i = 0; i < c.size(); ++i) c.data()[i] = 100.0 * e + i;
+      cells.push_back(c);
+    }
+    a.set(tidx, cells.begin());
+  }
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) continue;
+    auto tile = a.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      verify_cell(tile.data()[ord], e, /*expect_filled=*/true);
+    }
+  }
+}
+
+/// Distributed: fetching an arena ToT tile owned by another rank transports
+/// it via madness::archive, exercising Tensor<ArenaTensor>'s arena-aware
+/// serialization end-to-end (slab marshalled out, rebuilt on the receiver).
+void test_distributed_arena_tot() {
+  using InnerTile = TA::ArenaTensor<double>;
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, TA::DensePolicy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4, 6, 8, 10, 12, 14}};  // 7 outer tiles
+  Array a(world, trange);
+  a.init_tiles_nested(
+      [](const auto& idx) { return inner_range_for<InnerTile>(idx); },
+      [](auto& cell, const auto& idx) { fill_cell(cell, idx); });
+  world.gop.fence();
+  std::size_t nremote = 0;
+  for (const auto& tidx : a.trange().tiles_range()) {
+    if (!a.is_local(tidx)) ++nremote;
+    auto tile = a.find(tidx).get();  // remote tile -> serialized transfer
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      verify_cell(tile.data()[ord], e, /*expect_filled=*/true);
+    }
+  }
+  // with >1 rank at least one tile must have been fetched (transported) here
+  if (world.size() > 1) BOOST_CHECK_GT(nremote, 0u);
+  world.gop.fence();
+}
+
+/// DistArray-level expression on a tensor-of-tensors, for plain and arena
+/// inner tiles. `fill_a`/`fill_b` populate operands a/b; `expr(c, a, b)`
+/// evaluates the expression under test; `expected(e, i)` is the reference
+/// value for element i of outer element e.
+template <typename InnerTile, typename Policy, typename FillA, typename FillB,
+          typename Expr, typename Expected>
+void run_tot_expr(FillA fill_a, FillB fill_b, Expr expr, Expected expected) {
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}};
+  Array a(world, trange), b(world, trange);
+  auto range_fn = [](const auto& idx) {
+    return inner_range_for<InnerTile>(idx);
+  };
+  a.init_tiles_nested(range_fn, fill_a);
+  b.init_tiles_nested(range_fn, fill_b);
+  world.gop.fence();
+  Array c;
+  expr(c, a, b);
+  world.gop.fence();
+  for (const auto& tidx : c.trange().tiles_range()) {
+    if (!c.is_local(tidx)) continue;
+    auto tile = c.find(tidx).get();
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const long e = static_cast<long>(tile.range().idx(ord)[0]);
+      const auto& cell = tile.data()[ord];
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_CHECK_EQUAL(static_cast<long>(cell.size()), inner_extent(e));
+      for (std::size_t i = 0; i < cell.size(); ++i)
+        BOOST_CHECK_EQUAL(cell.data()[i], expected(e, static_cast<long>(i)));
+    }
+  }
+}
+
+/// c = a + b, element-wise over matching inner cells.
+template <typename InnerTile, typename Policy>
+void test_tot_add() {
+  auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); };
+  run_tot_expr<InnerTile, Policy>(
+      fill, fill,
+      [](auto& c, auto& a, auto& b) { c("i;j") = a("i;j") + b("i;j"); },
+      [](long e, long i) { return 2.0 * (100.0 * e + i); });
+}
+
+/// c = a - b, element-wise over matching inner cells.
+template <typename InnerTile, typename Policy>
+void test_tot_subt() {
+  auto fill_a = [](auto& cell, const auto& idx) {
+    const long e = static_cast<long>(idx[0]);
+    for (std::size_t i = 0; i < cell.size(); ++i)
+      cell.data()[i] = 300.0 * e + 2.0 * i;
+  };
+  auto fill_b = [](auto& cell, const auto& idx) { fill_cell(cell, idx); };
+  run_tot_expr<InnerTile, Policy>(
+      fill_a, fill_b,
+      [](auto& c, auto& a, auto& b) { c("i;j") = a("i;j") - b("i;j"); },
+      [](long e, long i) { return 200.0 * e + i; });
+}
+
+/// c = a * b, full Hadamard (outer and inner) over matching inner cells.
+template <typename InnerTile, typename Policy>
+void test_tot_mult() {
+  auto fill_a = [](auto& cell, const auto& idx) {
+    const long e = static_cast<long>(idx[0]);
+    for (std::size_t i = 0; i < cell.size(); ++i)
+      cell.data()[i] = static_cast<double>(e + static_cast<long>(i) + 1);
+  };
+  auto fill_b = [](auto& cell, const auto&) {
+    for (std::size_t i = 0; i < cell.size(); ++i) cell.data()[i] = 3.0;
+  };
+  run_tot_expr<InnerTile, Policy>(
+      fill_a, fill_b,
+      [](auto& c, auto& a, auto& b) { c("i;j") = a("i;j") * b("i;j"); },
+      [](long e, long i) { return 3.0 * (e + i + 1); });
+}
+
+/// c = 3 * a, scalar scaling over inner cells.
+template <typename InnerTile, typename Policy>
+void test_tot_scale() {
+  auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); };
+  run_tot_expr<InnerTile, Policy>(
+      fill, fill, [](auto& c, auto& a, auto&) { c("i;j") = 3.0 * a("i;j"); },
+      [](long e, long i) { return 3.0 * (100.0 * e + i); });
+}
+
+/// c = 3 * (a + b); exercises the scaled-add tile op (add with a factor).
+template <typename InnerTile, typename Policy>
+void test_tot_scaled_add() {
+  auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); };
+  run_tot_expr<InnerTile, Policy>(
+      fill, fill,
+      [](auto& c, auto& a, auto& b) { c("i;j") = 3.0 * (a("i;j") + b("i;j")); },
+      [](long e, long i) { return 6.0 * (100.0 * e + i); });
+}
+
+/// c = 3 * (a - b); exercises the scaled-subt tile op (subt with a factor).
+template <typename InnerTile, typename Policy>
+void test_tot_scaled_subt() {
+  auto fill_a = [](auto& cell, const auto& idx) {
+    const long e = static_cast<long>(idx[0]);
+    for (std::size_t i = 0; i < cell.size(); ++i)
+      cell.data()[i] = 300.0 * e + 2.0 * i;
+  };
+  auto fill_b = [](auto& cell, const auto& idx) { fill_cell(cell, idx); };
+  run_tot_expr<InnerTile, Policy>(
+      fill_a, fill_b,
+      [](auto& c, auto& a, auto& b) { c("i;j") = 3.0 * (a("i;j") - b("i;j")); },
+      [](long e, long i) { return 3.0 * (200.0 * e + i); });
+}
+
+/// c = -a, negation over inner cells.
+template <typename InnerTile, typename Policy>
+void test_tot_neg() {
+  auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); };
+  run_tot_expr<InnerTile, Policy>(
+      fill, fill, [](auto& c, auto& a, auto&) { c("i;j") = -a("i;j"); },
+      [](long e, long i) { return -(100.0 * e + i); });
+}
+
+/// End-to-end ToT contraction through TA::einsum: outer Hadamard over i,j
+/// with an outer contraction over k, plus an inner contraction. This routes
+/// through the regime-A arena einsum path (the outer-Hadamard "hadamard
+/// reduction" branch), not the expression-DSL delegation a pure-Hadamard
+/// outer would take. `annot` is the einsum string; a's inner cells are
+/// `a0 x a1`, b's are `b0 x b1`. A non-canonical inner annotation exercises
+/// the inner-permutation hoist. The arena-inner result is checked against a
+/// Tensor<Tensor<double>> reference run of the identical expression.
+template <typename InnerTile, typename Policy>
+void test_tot_einsum_contraction(const char* annot, std::size_t a0,
+                                 std::size_t a1, std::size_t b0,
+                                 std::size_t b1) {
+  using Array = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  using RefArray = TA::DistArray<TA::Tensor<TA::Tensor<double>>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}, {0, 2, 4}, {0, 2}};
+
+  auto fill_a = [](auto& cell, const auto& idx) {
+    const long key = 7 * static_cast<long>(idx[0]) +
+                     13 * static_cast<long>(idx[1]) +
+                     31 * static_cast<long>(idx[2]);
+    for (std::size_t p = 0; p < cell.size(); ++p)
+      cell.data()[p] = static_cast<double>(1 + static_cast<long>(p) + key);
+  };
+  auto fill_b = [](auto& cell, const auto& idx) {
+    const long key = 5 * static_cast<long>(idx[0]) +
+                     3 * static_cast<long>(idx[1]) +
+                     11 * static_cast<long>(idx[2]);
+    for (std::size_t p = 0; p < cell.size(); ++p)
+      cell.data()[p] = static_cast<double>(2 + static_cast<long>(p) + key);
+  };
+
+  Array a(world, trange), b(world, trange);
+  a.init_tiles_nested(
+      [a0, a1](const auto&) { return inner_range_2d<InnerTile>(a0, a1); },
+      fill_a);
+  b.init_tiles_nested(
+      [b0, b1](const auto&) { return inner_range_2d<InnerTile>(b0, b1); },
+      fill_b);
+  RefArray a_ref(world, trange), b_ref(world, trange);
+  a_ref.init_tiles_nested(
+      [a0, a1](const auto&) {
+        return inner_range_2d<TA::Tensor<double>>(a0, a1);
+      },
+      fill_a);
+  b_ref.init_tiles_nested(
+      [b0, b1](const auto&) {
+        return inner_range_2d<TA::Tensor<double>>(b0, b1);
+      },
+      fill_b);
+  world.gop.fence();
+
+  auto c = TA::einsum(annot, a, b);
+  auto c_ref = TA::einsum(annot, a_ref, b_ref);
+  world.gop.fence();
+
+  for (const auto& tidx : c.trange().tiles_range()) {
+    if (!c.is_local(tidx)) continue;
+    auto tile = c.find(tidx).get();
+    auto ref_tile = c_ref.find(tidx).get();
+    BOOST_REQUIRE_EQUAL(tile.range().volume(), ref_tile.range().volume());
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const auto& cell = tile.data()[ord];
+      const auto& ref_cell = ref_tile.data()[ord];
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_REQUIRE_EQUAL(cell.size(), ref_cell.size());
+      for (std::size_t p = 0; p < cell.size(); ++p)
+        BOOST_CHECK_EQUAL(cell.data()[p], ref_cell.data()[p]);
+    }
+  }
+}
+
+/// End-to-end T x ToT Hadamard through TA::einsum: a plain DistArray scales
+/// each inner cell of a ToT array. A pure-Hadamard outer ("ij,ij;a->ij;a")
+/// makes einsum delegate to the expression DSL, exercising the arena
+/// `t x tot` Mult tile op. The arena-inner result is checked against an
+/// identical Tensor<double>-inner reference run (the legacy `binary` path).
+template <typename InnerTile, typename Policy>
+void test_tot_einsum_t_x_tot() {
+  using ToTArray = TA::DistArray<TA::Tensor<InnerTile>, Policy>;
+  using RefArray = TA::DistArray<TA::Tensor<TA::Tensor<double>>, Policy>;
+  using PlainArray = TA::DistArray<TA::Tensor<double>, Policy>;
+  TA::World& world = *GlobalFixture::world;
+  TA::TiledRange trange{{0, 2, 4}, {0, 2, 4}};
+
+  auto plain_fill = [](const TA::Range& r) {
+    TA::Tensor<double> t(r);
+    for (std::size_t p = 0; p < t.size(); ++p)
+      t.data()[p] = 1.0 + static_cast<double>(p);
+    return t;
+  };
+  auto tot_fill = [](auto& cell, const auto& idx) {
+    const long key =
+        7 * static_cast<long>(idx[0]) + 13 * static_cast<long>(idx[1]);
+    for (std::size_t p = 0; p < cell.size(); ++p)
+      cell.data()[p] = static_cast<double>(2 + static_cast<long>(p) + key);
+  };
+
+  PlainArray a(world, trange);
+  a.init_tiles(plain_fill);
+  ToTArray b(world, trange);
+  b.init_tiles_nested(
+      [](const auto&) {
+        return typename InnerTile::range_type(std::vector<std::size_t>{4});
+      },
+      tot_fill);
+  RefArray b_ref(world, trange);
+  b_ref.init_tiles_nested(
+      [](const auto&) {
+        return TA::Tensor<double>::range_type(std::vector<std::size_t>{4});
+      },
+      tot_fill);
+  world.gop.fence();
+
+  auto c = TA::einsum("ij,ij;a->ij;a", a, b);
+  auto c_ref = TA::einsum("ij,ij;a->ij;a", a, b_ref);
+  world.gop.fence();
+
+  for (const auto& tidx : c.trange().tiles_range()) {
+    if (!c.is_local(tidx)) continue;
+    auto tile = c.find(tidx).get();
+    auto ref_tile = c_ref.find(tidx).get();
+    BOOST_REQUIRE_EQUAL(tile.range().volume(), ref_tile.range().volume());
+    for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) {
+      const auto& cell = tile.data()[ord];
+      const auto& ref_cell = ref_tile.data()[ord];
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_REQUIRE_EQUAL(cell.size(), ref_cell.size());
+      for (std::size_t p = 0; p < cell.size(); ++p)
+        BOOST_CHECK_EQUAL(cell.data()[p], ref_cell.data()[p]);
+    }
+  }
+}
+
+/// Tensor<ArenaTensor>::permute with a bipartite permutation: outer cells
+/// reorder shallowly, inner cells are permuted into a fresh slab. Here both
+/// the outer and inner parts are transposes.
+void test_arena_tile_permute() {
+  using Inner = TA::ArenaTensor<double>;
+  using Outer = TA::Tensor<Inner>;
+  constexpr long OI = 2, OJ = 3, R = 4, C = 5;
+  auto val = [](long oi, long oj, long ii, long ij) {
+    return 1.0 + oi * 1000.0 + oj * 100.0 + ii * 10.0 + ij;
+  };
+  Outer tile = TA::detail::make_nested_tile<Outer>(
+      TA::Range{OI, OJ},
+      [](const auto&) { return inner_range_2d<Inner>(R, C); },
+      [&val](auto& cell, const auto& idx) {
+        const long oi = static_cast<long>(idx[0]);
+        const long oj = static_cast<long>(idx[1]);
+        for (long ii = 0; ii < R; ++ii)
+          for (long ij = 0; ij < C; ++ij)
+            cell.data()[ii * C + ij] = val(oi, oj, ii, ij);
+      });
+
+  // bipartite transpose over the combined index space {0,1 | 2,3}: outer
+  // part transposes dims 0,1 and inner part transposes dims 2,3; the trailing
+  // 2 marks the second (inner) partition size.
+  TA::BipartitePermutation bperm(TA::Permutation{1, 0, 3, 2}, 2);
+  Outer p = tile.permute(bperm);
+
+  // outer range transposed: {OI,OJ} -> {OJ,OI}
+  BOOST_REQUIRE_EQUAL(p.range().extent(0), OJ);
+  BOOST_REQUIRE_EQUAL(p.range().extent(1), OI);
+  for (long oi = 0; oi < OI; ++oi)
+    for (long oj = 0; oj < OJ; ++oj) {
+      // src outer (oi,oj) lands at result outer (oj,oi)
+      const auto& cell = p.data()[oj * OI + oi];
+      BOOST_REQUIRE(!cell.empty());
+      BOOST_REQUIRE_EQUAL(static_cast<long>(cell.size()), R * C);
+      // inner transposed: result cell range {R,C} -> {C,R}
+      BOOST_CHECK_EQUAL(cell.range().extent(0), C);
+      BOOST_CHECK_EQUAL(cell.range().extent(1), R);
+      for (long ii = 0; ii < R; ++ii)
+        for (long ij = 0; ij < C; ++ij)
+          BOOST_CHECK_EQUAL(cell.data()[ij * R + ii], val(oi, oj, ii, ij));
+    }
+}
+
+}  // namespace
+
+BOOST_AUTO_TEST_SUITE(tot_construction_suite, TA_UT_LABEL_SERIAL)
+
+BOOST_AUTO_TEST_CASE(make_nested_tile_tensor_inner) {
+  test_make_nested_tile<TA::Tensor<double>>();
+}
+
+BOOST_AUTO_TEST_CASE(make_nested_tile_arena_inner) {
+  test_make_nested_tile<TA::ArenaTensor<double>>();
+}
+
+BOOST_AUTO_TEST_CASE(dist_array_tot_ctor_tensor_inner) {
+  test_dist_array_tot_ctor<TA::Tensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(dist_array_tot_ctor_arena_inner) {
+  test_dist_array_tot_ctor<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(init_tiles_nested_tensor_inner) {
+  test_init_tiles_nested<TA::Tensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(init_tiles_nested_arena_inner) {
+  test_init_tiles_nested<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(fill_random_tensor_inner) {
+  test_fill_random<TA::Tensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(fill_random_arena_inner) {
+  test_fill_random<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(init_elements_tensor_inner) {
+  test_init_elements<TA::Tensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(init_elements_arena_inner) {
+  test_init_elements<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(fill_arena_inner) { test_fill_arena(); }
+
+BOOST_AUTO_TEST_CASE(set_value_arena_inner) { test_set_value_arena(); }
+
+BOOST_AUTO_TEST_CASE(set_iter_arena_inner) { test_set_iter_arena(); }
+
+BOOST_AUTO_TEST_CASE(add_tensor_inner) {
+  test_tot_add<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(add_arena_inner) {
+  test_tot_add<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(subt_tensor_inner) {
+  test_tot_subt<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(subt_arena_inner) {
+  test_tot_subt<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(mult_tensor_inner) {
+  test_tot_mult<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(mult_arena_inner) {
+  test_tot_mult<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(scaled_add_tensor_inner) {
+  test_tot_scaled_add<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(scaled_add_arena_inner) {
+  test_tot_scaled_add<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(scaled_subt_tensor_inner) {
+  test_tot_scaled_subt<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(scaled_subt_arena_inner) {
+  test_tot_scaled_subt<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(scale_tensor_inner) {
+  test_tot_scale<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(scale_arena_inner) {
+  test_tot_scale<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(neg_tensor_inner) {
+  test_tot_neg<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(neg_arena_inner) {
+  test_tot_neg<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+// canonical inner contraction: c(ij;mn) = sum_k sum_o a(ijk;mo) b(ijk;on)
+BOOST_AUTO_TEST_CASE(einsum_contraction_tensor_inner) {
+  test_tot_einsum_contraction<TA::Tensor<double>, TA::DensePolicy>(
+      "ijk;mo,ijk;on->ij;mn", 2, 3, 3, 2);
+}
+BOOST_AUTO_TEST_CASE(einsum_contraction_arena_inner) {
+  test_tot_einsum_contraction<TA::ArenaTensor<double>, TA::DensePolicy>(
+      "ijk;mo,ijk;on->ij;mn", 2, 3, 3, 2);
+}
+
+// non-canonical inner annotations: operand A reordered (o,m) and the result
+// reordered (n,m) -- exercises the regime-A inner-permutation hoist.
+BOOST_AUTO_TEST_CASE(einsum_contraction_perm_tensor_inner) {
+  test_tot_einsum_contraction<TA::Tensor<double>, TA::DensePolicy>(
+      "ijk;om,ijk;on->ij;nm", 3, 2, 3, 2);
+}
+BOOST_AUTO_TEST_CASE(einsum_contraction_perm_arena_inner) {
+  test_tot_einsum_contraction<TA::ArenaTensor<double>, TA::DensePolicy>(
+      "ijk;om,ijk;on->ij;nm", 3, 2, 3, 2);
+}
+
+// inner Hadamard with a permuted operand: c(ij;mn) = sum_k a(ijk;mn) b(ijk;nm)
+BOOST_AUTO_TEST_CASE(einsum_hadamard_perm_tensor_inner) {
+  test_tot_einsum_contraction<TA::Tensor<double>, TA::DensePolicy>(
+      "ijk;mn,ijk;nm->ij;mn", 2, 3, 3, 2);
+}
+BOOST_AUTO_TEST_CASE(einsum_hadamard_perm_arena_inner) {
+  test_tot_einsum_contraction<TA::ArenaTensor<double>, TA::DensePolicy>(
+      "ijk;mn,ijk;nm->ij;mn", 2, 3, 3, 2);
+}
+
+// plain T x ToT Hadamard: c(ij;a) = a(ij) * b(ij;a), routed through the
+// expression-DSL Mult tile op.
+BOOST_AUTO_TEST_CASE(einsum_t_x_tot_tensor_inner) {
+  test_tot_einsum_t_x_tot<TA::Tensor<double>, TA::DensePolicy>();
+}
+BOOST_AUTO_TEST_CASE(einsum_t_x_tot_arena_inner) {
+  test_tot_einsum_t_x_tot<TA::ArenaTensor<double>, TA::DensePolicy>();
+}
+
+BOOST_AUTO_TEST_CASE(arena_tile_bipartite_permute) {
+  test_arena_tile_permute();
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(tot_construction_dist_suite, TA_UT_LABEL_DISTRIBUTED)
+
+BOOST_AUTO_TEST_CASE(arena_tot_remote_tile_transport) {
+  test_distributed_arena_tot();
+}
+
+BOOST_AUTO_TEST_SUITE_END()