diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index bacd58fb0e..3b745fdd28 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -27,11 +27,14 @@ #define TILEDARRAY_ARRAY_IMPL_H__INCLUDED #include +#include #include #include #include #include +#include + namespace TiledArray { namespace detail { @@ -986,48 +989,104 @@ std::shared_ptr> make_with_new_trange( Policy::default_pmap(world, target_trange.tiles_range().volume())), Array::lazy_deleter); auto& target_array = *target_array_sptr; - target_array.init_tiles([value = new_value_fill](const Range& range) { - return typename Array::value_type(range, value); - }); - target_array.world().gop.fence(); - - // loop over local tile and sends its contributions to the targets - { - const auto e = source_array.cend(); - auto& target_tiles_range = target_trange.tiles_range(); - for (auto it = source_array.cbegin(); it != e; ++it) { - const auto& source_tile = *it; - auto source_tile_idx = it.index(); - // make range for iterating over all possible target tile idx combinations - TA::Index target_tile_ord_extent_range(rank); - for (auto d = 0; d != rank; ++d) { - target_tile_ord_extent_range[d] = - all_target_tiles[d][source_tile_idx[d]].size(); + if constexpr (detail::is_tensor_of_tensor_v && + is_arena_tensor_v) { + // Arena tensor-of-tensor: a ToT tile's inner cells are non-owning views + // into that tile's own arena slab. The generic null-init + write_tile_block + // scatter (the `else` branch) would rebind the target's null inner cells to + // the *source* tiles' slabs, leaving them dangling once the source array is + // destroyed. Instead build each local target tile directly (deep copy) by + // pulling the source cells: a retile preserves the element space, so the + // target cell at global outer element `e` takes its inner range and data + // from the source cell at `e` (elements outside the source range, e.g. 
a + // retile that grows the element range, yield null cells). + using inner_range_type = typename Tile::value_type::range_type; + const auto& source_elements = source_array.trange().elements_range(); + std::map src_tile_cache; + auto source_cell_at = + [&](const auto& e) -> const typename Tile::value_type* { + if (!source_elements.includes(e)) return nullptr; + const auto src_tile_idx = source_array.trange().element_to_tile(e); + const auto src_ord = + source_array.trange().tiles_range().ordinal(src_tile_idx); + auto it = src_tile_cache.find(src_ord); + if (it == src_tile_cache.end()) { + it = src_tile_cache + .emplace(src_ord, source_array.is_zero(src_tile_idx) + ? Tile{} + : source_array.get(src_tile_idx).get()) + .first; } - - // loop over every target tile combination - TA::Range target_tile_ord_extent(target_tile_ord_extent_range); - for (auto& target_tile_ord : target_tile_ord_extent) { - TA::Index target_tile_idx(rank); - container::svector target_tile_rngs1(rank); + const Tile& st = it->second; + if (st.empty()) return nullptr; + return &st(e); + }; + for (const auto target_ord : *target_array.pmap()) { + if (target_array.is_zero(target_ord)) continue; + Tile tile = make_nested_tile( + target_trange.make_tile_range(target_ord), + [&](const auto& e) -> inner_range_type { + const auto* sc = source_cell_at(e); + return (sc && !sc->empty()) ? 
sc->range() : inner_range_type{}; + }, + [&](auto& cell, const auto& e) { + const auto* sc = source_cell_at(e); + if (sc && !sc->empty()) { + const auto* s = sc->data(); + auto* d = cell.data(); + for (std::size_t p = 0; p < cell.size(); ++p) d[p] = s[p]; + } + }); + target_array.set(target_ord, std::move(tile)); + } + target_array.world().gop.fence(); + } else { + target_array.init_tiles([value = new_value_fill](const Range& range) { + return typename Array::value_type(range, value); + }); + target_array.world().gop.fence(); + + // loop over local tile and sends its contributions to the targets + { + const auto e = source_array.cend(); + auto& target_tiles_range = target_trange.tiles_range(); + for (auto it = source_array.cbegin(); it != e; ++it) { + const auto& source_tile = *it; + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx + // combinations + TA::Index target_tile_ord_extent_range(rank); for (auto d = 0; d != rank; ++d) { - std::tie(target_tile_idx[d], target_tile_rngs1[d]) = - all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]]; + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); + } + + // loop over every target tile combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + container::svector target_tile_rngs1(rank); + for (auto d = 0; d != rank; ++d) { + std::tie(target_tile_idx[d], target_tile_rngs1[d]) = + all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]]; + } + TA_ASSERT(source_tile.future().probe()); + Tile target_tile_contribution( + source_tile.get().block(target_tile_rngs1)); + auto target_tile_idx_ord = + target_tiles_range.ordinal(target_tile_idx); + auto target_proc = target_array.pmap()->owner(target_tile_idx_ord); + world.taskq.add(target_proc, &write_tile_block, + target_array.id(), target_tile_idx_ord, + 
target_tile_contribution); } - TA_ASSERT(source_tile.future().probe()); - Tile target_tile_contribution( - source_tile.get().block(target_tile_rngs1)); - auto target_tile_idx_ord = target_tiles_range.ordinal(target_tile_idx); - auto target_proc = target_array.pmap()->owner(target_tile_idx_ord); - world.taskq.add(target_proc, &write_tile_block, - target_array.id(), target_tile_idx_ord, - target_tile_contribution); } } + // data is mutated in place, so must wait for all tasks to complete + target_array.world().gop.fence(); } - // data is mutated in place, so must wait for all tasks to complete - target_array.world().gop.fence(); // WARNING!! need to truncate in DistArray ctor return target_array_sptr; diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c3cafb5605..6b994e17c0 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -28,6 +28,7 @@ #include "TiledArray/pmap/replicated_pmap.h" #include "TiledArray/policies/dense_policy.h" #include "TiledArray/replicator.h" +#include "TiledArray/tensor/arena_kernels.h" #include "TiledArray/tile_interface/cast.h" #include "TiledArray/util/annotation.h" #include "TiledArray/util/initializer_list.h" @@ -253,6 +254,35 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::shared_ptr()) : pimpl_(init(get_default_world(), trange, shape, pmap)) {} + /// Tensor-of-tensors array constructor + + /// Constructs a tensor-of-tensors array in fully-shaped state: every inner + /// cell of every local tile is allocated (its range taken from + /// \p inner_range_fn, element storage zero-initialized), so the array + /// immediately satisfies the ToT validity invariant and is ready for + /// in-place fill (\c fill, \c foreach_inplace, element writes, ...). + /// Enabled only when \c Tile is a tensor-of-tensors. + /// \tparam InnerRangeFn callable type + /// \param world The world where the array will live. 
+ /// \param trange The tiled range object that defines the array tiling. + /// \param inner_range_fn callable mapping a global outer element index to + /// that inner cell's range; a zero-volume range yields a null cell. + /// \param pmap The tile index -> process map + template < + typename InnerRangeFn, + typename = std::enable_if_t< + detail::is_tensor_of_tensor_v && + !std::is_convertible_v, shape_type> && + !std::is_convertible_v, + std::shared_ptr>>> + DistArray(World& world, const trange_type& trange, + InnerRangeFn&& inner_range_fn, + const std::shared_ptr& pmap = {}) + : DistArray(world, trange, pmap) { + init_tiles_nested(std::forward(inner_range_fn), + detail::nested_fill_noop{}); + } + /// \name Initializer list constructors /// \brief Creates a new tensor containing the elements in the provided /// `std::initializer_list`. @@ -779,7 +809,23 @@ class DistArray : public madness::archive::ParallelSerializableObject { detail::is_input_iterator::value>> void set(const Index& i, InIter first) { check_index(i); - pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first)); + if constexpr (detail::is_tensor_of_tensor_v && + is_arena_tensor_v) { + // arena ToT: the iterated inner tiles carry the ranges needed to size + // the slab; buffer them (the iterator is single-pass) and build. 
+ const auto outer_range = pimpl_->trange().make_tile_range(i); + using SrcTile = std::decay_t; + std::vector buf; + buf.reserve(outer_range.volume()); + for (std::size_t k = 0; k < outer_range.volume(); ++k, ++first) + buf.emplace_back(*first); + pimpl_->set(i, make_arena_nested_tile( + outer_range, [&buf](std::size_t k) -> const SrcTile& { + return buf[k]; + })); + } else { + pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), first)); + } } /// Set a tile and fill it using a sequence @@ -828,7 +874,20 @@ class DistArray : public madness::archive::ParallelSerializableObject { typename = enable_if_is_integral_or_integral_range> void set(const Index& i, const element_type& value = element_type()) { check_index(i); - pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), value)); + if constexpr (detail::is_tensor_of_tensor_v && + is_arena_tensor_v) { + // arena ToT: every inner cell takes `value`'s (initialized) range and a + // deep copy of its data -- build the slab-backed tile from that range. + TA_ASSERT(!value.empty() && + "DistArray::set: a null inner tile has no range to size cells"); + pimpl_->set(i, make_arena_nested_tile( + pimpl_->trange().make_tile_range(i), + [&value](std::size_t) -> const element_type& { + return value; + })); + } else { + pimpl_->set(i, value_type(pimpl_->trange().make_tile_range(i), value)); + } } /// Set every element of a tile to a specified value @@ -908,12 +967,33 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. 
- template - std::int64_t fill_local(const element_type& value = element_type(), - bool skip_set = false) { - return init_tiles( - [value](const range_type& range) { return value_type(range, value); }, - skip_set); + /// + /// \tparam V the value type; defaults to \c element_type but may be any + /// type assignable to \c element_type& -- a freestanding + /// \c ArenaTensor cannot be minted, so an arena-backed ToT is + /// filled by passing e.g. an owning \c TA::Tensor. + /// \note For an *arena-backed* tensor-of-tensors tile type this is an + /// in-place mutator over an already-shaped array (constructed via + /// \c init_tiles_nested or the ToT range_fn ctor): \p value is deep-copied + /// into every (bound) inner cell, so it must match each cell's volume. + template >> + std::int64_t fill_local(const V& value = V(), bool skip_set = false) { + if constexpr (detail::is_tensor_of_tensor_v && + is_arena_tensor_v) { + return for_each_local_tile_inplace([value](value_type& outer) { + for (std::size_t o = 0; o < outer.size(); ++o) { + auto& cell = outer.data()[o]; + if (cell.empty()) continue; // skip deliberately-null cells + cell = value; // deep copy into the bound arena cell + } + }); + } else { + return init_tiles( + [value](const range_type& range) { return value_type(range, value); }, + skip_set); + } } /// Fill all local tiles with the specified value @@ -930,11 +1010,16 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. 
- template - std::int64_t fill(const element_type& value = numeric_type(), - bool skip_set = false) { - // for sparse arrays filled with zero, replace with an empty array - if constexpr (!is_dense_v) { + template >> + std::int64_t fill(const V& value = V(), bool skip_set = false) { + // for sparse arrays filled with zero, replace with an empty array; + // an arena-backed ToT is shaped before fill (fill_local mutates in place), + // and its inner view tiles have no zero-comparison -- skip for those + if constexpr (!is_dense_v && + !(detail::is_tensor_of_tensor_v && + is_arena_tensor_v)) { if (value == element_type()) { *this = DistArray( world(), trange(), @@ -957,7 +1042,9 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \tparam fence If Fence::No, the operation will return early, /// before the tasks have completed /// \tparam T The type of random value to generate. Defaults to - /// element_type. + /// numeric_type (the scalar type), so this works for + /// tensor-of-tensors arrays, where it fills every inner scalar + /// in place over an already-shaped array (see \c init_elements). /// \param[in] skip_set If false, will throw if any tiles are already set /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong @@ -965,11 +1052,25 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already initialized. Weak throw guarantee. 
template > std::int64_t fill_random(bool skip_set = false) { - return init_elements( - [](const auto&) { return detail::MakeRandom::generate_value(); }); + if constexpr (detail::is_tensor_of_tensor_v) { + // in-place over an already-shaped ToT array (plain or arena-backed): + // overwrite every inner scalar, leaving inner ranges untouched + return for_each_local_tile_inplace([](value_type& outer) { + for (std::size_t o = 0; o < outer.size(); ++o) { + auto& cell = outer.data()[o]; + if (cell.empty()) continue; + const std::size_t n = cell.size(); + for (std::size_t i = 0; i < n; ++i) + cell.data()[i] = detail::MakeRandom::generate_value(); + } + }); + } else { + return init_elements( + [](const auto&) { return detail::MakeRandom::generate_value(); }); + } } /// Initialize (local) tiles with a user provided functor @@ -1042,20 +1143,94 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \throw TiledArray::Exception if skip_set is false and a local, non-zero /// tile is already initialized. Weak throw /// guarantee. + /// + /// \note \p op must return a freestanding value assignable to + /// \c element_type&. For an *arena-backed* tensor-of-tensors tile type the + /// inner cell is a non-owning view that cannot be minted standalone, so + /// \p op returns an owning tensor (e.g. \c TA::Tensor): each outer tile + /// collects its \p op outputs, then allocates one arena slab sized to them + /// (via \c detail::make_nested_tile) and deep-copies the outputs into the + /// bound inner cells. 
template std::int64_t init_elements(Op&& op, bool skip_set = false) { auto op_shared_handle = make_op_shared_handle(std::forward(op)); - return init_tiles( - [op = std::move(op_shared_handle)]( - const TiledArray::Range& range) -> value_type { - // Initialize the tile with the given range object - Tile tile(range); + if constexpr (detail::is_tensor_of_tensor_v && + is_arena_tensor_v) { + return init_tiles( + [op = std::move(op_shared_handle)]( + const TiledArray::Range& outer_range) -> value_type { + using R = std::decay_t; + static_assert( + std::is_assignable_v, + "DistArray::init_elements: op must return a freestanding " + "tensor assignable to the inner tile type"); + // pass 1: collect op's freestanding inner tensors; pass 2: + // make_arena_nested_tile sizes the slab and deep-copies them in + std::vector collected; + collected.reserve(outer_range.volume()); + for (std::size_t o = 0; o < outer_range.volume(); ++o) + collected.emplace_back(op(outer_range.idx(o))); + return make_arena_nested_tile( + outer_range, [&collected](std::size_t k) -> const R& { + return collected[k]; + }); + }, + skip_set); + } else { + return init_tiles( + [op = std::move(op_shared_handle)]( + const TiledArray::Range& range) -> value_type { + // Initialize the tile with the given range object + Tile tile(range); + + // Initialize tile elements + for (auto& idx : range) tile[idx] = op(idx); + + return tile; + }, + skip_set); + } + } - // Initialize tile elements - for (auto& idx : range) tile[idx] = op(idx); + /// Initialize tensor-of-tensors tiles two-pass with user-provided functors - return tile; + /// A whole-tile constructor (like \c init_tiles), specialized for + /// tensor-of-tensors \c Tile s: each local tile is built via + /// \c detail::make_nested_tile -- \p inner_range_fn sizes every inner cell, + /// \p inner_fill_fn fills it -- so arena-backed inner cells are allocated in + /// one slab per tile. The work is done in parallel, so both functors must be + /// thread safe. 
The expected signatures are: + /// \code + /// inner_range_type inner_range_fn(const Index& outer_element_index) + /// void inner_fill_fn(inner_tile& cell, const Index& outer_element_index) + /// \endcode + /// where \c outer_element_index is a global element index. A zero-volume + /// inner range yields a deliberately-null inner cell, which \p inner_fill_fn + /// is not invoked on. + /// \tparam InnerRangeFn callable producing each inner cell's range + /// \tparam InnerFillFn callable filling each non-null inner cell + /// \param[in] inner_range_fn maps a global outer element index to an inner + /// range + /// \param[in] inner_fill_fn fills a non-null inner cell + /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized + template >> + std::int64_t init_tiles_nested(InnerRangeFn&& inner_range_fn, + InnerFillFn&& inner_fill_fn, + bool skip_set = false) { + auto range_fn = + make_op_shared_handle(std::forward(inner_range_fn)); + auto fill_fn = + make_op_shared_handle(std::forward(inner_fill_fn)); + return init_tiles( + [range_fn = std::move(range_fn), fill_fn = std::move(fill_fn)]( + const TiledArray::Range& outer_tile_range) -> value_type { + return detail::make_nested_tile(outer_tile_range, + range_fn, fill_fn); }, skip_set); } @@ -1705,6 +1880,78 @@ class DistArray : public madness::archive::ParallelSerializableObject { #endif // NDEBUG } + /// Applies an in-place mutator to every local, non-zero tile. + + /// This is the engine behind the tensor-of-tensors \c fill* / \c + /// init_elements path: the array must already be shaped (every local tile + /// future registered, e.g. by the ToT range_fn constructor or \c + /// init_tiles_nested), and \p tile_op mutates each tile's data in place + /// without re-shaping it. \p tile_op must be callable as \c void(value_type&) + /// and thread safe. 
The mutation tasks chain off the existing tile futures, + /// so they run only after tile construction completes; this call blocks + /// locally until every mutation finishes, so on return all local tiles hold + /// their final data. + /// \tparam fence if Fence::Global, also fences the array's World on exit + /// \param[in] tile_op the in-place per-tile mutator + /// \return the number of tiles mutated + template + std::int64_t for_each_local_tile_inplace(TileOp&& tile_op) { + auto op = make_op_shared_handle(std::forward(tile_op)); + World& w = world(); + std::atomic ndone{0}; + // hold the mutation-task futures so they (and the callbacks below) stay + // alive until every task has run; the futures are not re-set into the + // array -- the mutation happens behind the existing tile futures. + std::vector> done; + for (const auto& index : *(pmap())) { + if (is_zero(index)) continue; + Future& fut = find_local(index); + Future mutated = w.taskq.add( + [op](value_type& tile) -> value_type { + op(tile); + return tile; + }, + fut); + mutated.register_callback( + new detail::IncrementCounter>(ndone)); + done.emplace_back(std::move(mutated)); + } + const std::int64_t ntiles = static_cast(done.size()); + if (ntiles > 0) + w.await([&ndone, ntiles]() { return ndone.load() == ntiles; }); + if constexpr (fence == Fence::Global) w.gop.fence(); + return ntiles; + } + + /// Builds one slab-backed arena tensor-of-tensors outer tile. + + /// Engine behind the arena-ToT paths of \c init_elements and \c set: + /// \p cell_source(ordinal) returns a freestanding tensor whose range sizes + /// inner cell \p ordinal and whose data fills it. The slab is allocated by + /// \c detail::make_nested_tile and each cell deep-copies its source. 
+ /// \param[in] outer_range the outer tile's range + /// \param[in] cell_source maps a cell ordinal to its source tensor + template + static value_type make_arena_nested_tile(const TiledArray::Range& outer_range, + CellSource&& cell_source) { + using InnerRange = typename element_type::range_type; + return detail::make_nested_tile( + outer_range, + [&](const auto& idx) -> InnerRange { + // the inner-cell range type is built from an extent list -- it is + // not constructible from a foreign range type + const auto& src = cell_source(outer_range.ordinal(idx)).range(); + const auto& src_ext = src.extent(); + std::vector ext(src.rank()); + for (std::size_t d = 0; d < src.rank(); ++d) + ext[d] = static_cast(src_ext[d]); + return InnerRange(ext); + }, + [&](auto& cell, const auto& idx) { + cell = cell_source(outer_range.ordinal(idx)); + }); + } + /// Code factorization of the actual assert for the other overloads void assert_pimpl() const { TA_ASSERT(pimpl_ && diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 8196803152..45229f89c6 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -7,6 +7,7 @@ #include "TiledArray/einsum/range.h" #include "TiledArray/expressions/fwd.h" #include "TiledArray/fwd.h" +#include "TiledArray/tensor/arena_einsum.h" #include "TiledArray/tiled_range.h" #include "TiledArray/tiled_range1.h" @@ -240,6 +241,34 @@ void replicate_tensor(Tensor &to, Tensor const &from) { // number of elements to be copied // (same as the number of elements in @c from) auto const N = from.range().volume(); + + if constexpr (TiledArray::is_arena_tensor_v) { + // arena ToT: an inner cell is an 8-byte view into the outer tile's slab. + // A plain std::copy of cells would leave `to` aliasing `from`'s slab -- + // dangling once `from` is gone. Build `to` as a fresh slab-backed tile + // and deep-copy each replicated inner cell's element data. 
+ using inner_t = typename Tensor::value_type; + using inner_range_t = typename inner_t::range_type; + using elem_t = typename inner_t::value_type; + const auto out_range = to.range(); + const std::size_t M = out_range.volume(); + auto range_fn = [&from, N](std::size_t ord) -> inner_range_t { + const auto &src = from.data()[ord % N]; + return src.empty() ? inner_range_t{} : src.range(); + }; + to = detail::arena_outer_init(out_range, 1, range_fn, + alignof(elem_t), /*zero_init=*/false); + for (std::size_t ord = 0; ord < M; ++ord) { + auto &dst = to.data()[ord]; + if (dst.empty()) continue; + const auto &src = from.data()[ord % N]; + const elem_t *s = src.data(); + elem_t *d = dst.data(); + for (std::size_t k = 0; k < dst.size(); ++k) d[k] = s[k]; + } + return; + } + for (auto i = 0; i < to.range().volume(); i += N) std::copy(from.begin(), from.end(), to.data() + i); } @@ -616,6 +645,14 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ::Einsum::index::permutation; using TiledArray::Permutation; + // Temporary sub-Worlds used by the generalized-contraction path below. + // Declared before AB/C so it is destroyed *after* them: an ArrayTerm's + // `.ei` member is a DistArray bound to one of these sub-Worlds, and + // ~DistArray -> lazy_deleter dereferences that World. If a sub-World + // outlived only by `worlds` were torn down first, that deref would hit a + // dead World (e.g. while unwinding an exception thrown mid-contraction). + std::vector> worlds; + std::tuple, ArrayTerm> AB{{A.array(), a}, {B.array(), b}}; @@ -686,6 +723,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::is_same_v); constexpr bool is_tot = detail::is_tensor_v; + // A non-owning view inner cell (e.g. ArenaTensor) has no value-returning + // per-cell product; the legacy element-op path below cannot run for it. + constexpr bool inner_is_view = TiledArray::is_tensor_view_v; auto element_hadamard_op = (is_tot && inner.h) ? 
std::make_optional( @@ -717,6 +757,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, auto pa = A.permutation; auto pb = B.permutation; + auto arena_plan = detail::make_regime_a_arena_plan( + A, B, inner, /*inner_perm=*/C.permutation); for (Index h : H.tiles) { auto const pc = C.permutation; auto const c = apply(pc, h); @@ -725,6 +767,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } + if (detail::run_regime_a_arena(arena_plan, h, batch, A, B, C, + C_local_tiles, tiles, trange)) + continue; ResultTensor tile(TiledArray::Range{batch}, typename ResultTensor::value_type{}); for (Index i : tiles) { @@ -743,17 +788,28 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t k = 0; k < batch; ++k) { using Ix = ::Einsum::Index; if constexpr (AreArrayToT) { - auto aik = ai.batch(k); - auto bik = bi.batch(k); - auto vol = aik.total_size(); - TA_ASSERT(vol == bik.total_size()); - - auto &el = tile({k}); - using TensorT = std::remove_reference_t; - - for (auto i = 0; i < vol; ++i) - add_to(el, element_product_op(aik.data()[i], bik.data()[i])); - + if constexpr (inner_is_view) { + // View inner cells (e.g. ArenaTensor) have no value-returning + // per-cell product; only run_regime_a_arena can produce them. + // Reaching this legacy path means the arena plan was inactive + // -- typically a permuted inner contraction (see + // TODO(arena-einsum-perm) in arena_einsum.h). + TA_EXCEPTION( + "TA::einsum: ToT x ToT product with view inner cells " + "(e.g. 
ArenaTensor) is supported only via the regime-A " + "arena fast path, which was inactive for this expression " + "(likely a permuted inner contraction)"); + } else { + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + + for (auto i = 0; i < vol; ++i) + add_to(el, element_product_op(aik.data()[i], bik.data()[i])); + } } else if constexpr (!AreArraySame) { auto aik = ai.batch(k); auto bik = bi.batch(k); @@ -762,11 +818,15 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, auto &el = tile({k}); + // Fused `el += inner_tensor * scalar` -- no scaled temporary + // (axpy_to works in-place, so it also supports view inner + // cells that cannot value-return a scaled tensor). + using TiledArray::axpy_to; for (auto i = 0; i < vol; ++i) if constexpr (IsArrayToT) { - add_to(el, scale(aik.data()[i], bik.data()[i])); + axpy_to(el, aik.data()[i], bik.data()[i]); } else { - add_to(el, scale(bik.data()[i], aik.data()[i])); + axpy_to(el, bik.data()[i], aik.data()[i]); } } else { @@ -819,8 +879,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::invoke(update_tr, std::get<0>(AB)); std::invoke(update_tr, std::get<1>(AB)); - std::vector> worlds; - // iterates over tiles of hadamard indices for (Index h : H.tiles) { auto &[A, B] = AB; diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 946bf431b6..ee2f721aa3 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +129,17 @@ class ContEngine : public BinaryEngine { const right_tile_element_type&)> element_return_op_; ///< Same as element_nonreturn_op_ but returns ///< the result + std::function + arena_hadamard_tile_op_; ///< Whole-tile op for a Hadamard-outer + + ///< contraction-inner product on arena + ///< (view-inner-cell) 
ToT tiles, where a + ///< value-returning per-cell op cannot be + ///< used; null otherwise + using arena_plan_storage_t = + TiledArray::detail::arena_plan_storage_t; + TA_NO_UNIQUE_ADDRESS arena_plan_storage_t arena_plan_; TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction size_type K_ = 1; ///< Inner dimension size @@ -253,7 +265,10 @@ class ContEngine : public BinaryEngine { // 1. if ToT inner tile op has been initialized if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { TA_ASSERT(element_nonreturn_op_); - TA_ASSERT(element_return_op_); + // a view inner cell (e.g. ArenaTensor) cannot host a value-returning + // inner op, so element_return_op_ is intentionally left null for it + if constexpr (!TiledArray::is_tensor_view_v) + TA_ASSERT(element_return_op_); } // Initialize children @@ -300,7 +315,13 @@ class ContEngine : public BinaryEngine { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - total_perm, this->element_nonreturn_op_); + total_perm, this->element_nonreturn_op_, + std::move(this->arena_plan_)); + // Plan ownership transferred to op_; mark carrier slot empty so any + // later use of arena_plan_ reads as "no plan" rather than moved-from. 
+ if constexpr (!std::is_same_v) { + this->arena_plan_.reset(); + } } trange_ = ContEngine_::make_trange(outer_perm); shape_ = ContEngine_::make_shape(outer_perm); @@ -330,7 +351,13 @@ class ContEngine : public BinaryEngine { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - total_perm, this->element_nonreturn_op_); + total_perm, this->element_nonreturn_op_, + std::move(this->arena_plan_)); + // Plan ownership transferred to op_; mark carrier slot empty so any + // later use of arena_plan_ reads as "no plan" rather than moved-from. + if constexpr (!std::is_same_v) { + this->arena_plan_.reset(); + } } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -513,6 +540,110 @@ class ContEngine : public BinaryEngine { protected: void init_inner_tile_op(const IndexList& inner_target_indices) { + if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { + constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (tot_x_tot && + TiledArray::is_tensor_view_v) { + // ToT x ToT with non-owning view inner cells (e.g. ArenaTensor). A + // view cell cannot host a value-returning inner op, so the + // owning-cell inner-op builder cannot be used. Two nested products + // are supported here: + // - the elementwise pure Hadamard, where the inner element op is + // unused anyway -- MultEngine::make_tile_op passes none and the + // outer Mult tile op recurses through Tensor::mult -- so + // element_*_op_ is left null; + // - the inner contraction (incl. inner outer-product), routed + // through the arena fast path: it writes results in place into + // pre-shaped view cells, so only element_nonreturn_op_ is needed. + // Every other nested product is deferred. 
+ const auto inner_prod = this->inner_product_type(); + if (inner_prod == TensorProduct::Hadamard && + this->product_type() == TensorProduct::Hadamard) { + // pure Hadamard: element_*_op_ left null + } else if (inner_prod == TensorProduct::Contraction) { + using op_type = TiledArray::detail::ContractReduce< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type>; + // The inner op is built *perm-free* on purpose. factor_ is absorbed + // into element_nonreturn_op_; operand inner transposes are folded + // into the inner GEMM via left_/right_inner_permtype_. A non-identity + // inner *result* permutation is NOT placed on this op + // (make_fused_contraction_lambda asserts a perm-free op); it is + // applied downstream instead -- by op_'s post-processing permute for + // a contraction outer product, or by arena_hadamard_inner_contract's + // slab-level post-pass for a Hadamard outer product. + auto contrreduce_op = op_type( + to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), this->factor_, + inner_size(this->indices_), inner_size(this->left_indices_), + inner_size(this->right_indices_)); + constexpr bool arena_eligible = + TiledArray::detail::is_contraction_arena_tot_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (!arena_eligible) { + TA_EXCEPTION( + "nested contraction on view inner tiles is supported only " + "for arena-backed tensors-of-tensors"); + } else { + // perm-free per-cell in-place contraction; used by both outer + // regimes below + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_contraction_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(contrreduce_op); + if (this->product_type() == TensorProduct::Contraction) { + // outer contraction: the SUMMA result is shaped from operand + // inner cells by arena_plan_; op_'s post-processing permute + // applies the (outer + inner) result permutation. 
+ this->arena_plan_ = + TiledArray::detail::make_contraction_arena_plan< + result_tile_type, left_tile_type, right_tile_type>( + TiledArray::detail::ArenaInnerShapeKind:: + gemm_result_range, + std::make_optional(contrreduce_op.gemm_helper()), + Permutation{}); + if (!bool(this->arena_plan_)) + TA_EXCEPTION( + "nested contraction on view inner tiles: the arena fast " + "path was inactive (arena disabled)"); + } else { + // outer Hadamard: MultEngine builds a binary tile op, which + // cannot use a value-returning per-cell op. Supply a whole-tile + // arena op that shapes the result from per-cell inner GEMMs and + // fills it in place; the inner result permutation is a + // slab-level post-pass inside the kernel. + this->arena_hadamard_tile_op_ = + [cell_op = this->element_nonreturn_op_, + inner_gh = contrreduce_op.gemm_helper(), + inner_perm = inner(this->perm_)]( + const left_tile_type& l, + const right_tile_type& r) -> result_tile_type { + return TiledArray::detail::arena_hadamard_inner_contract< + result_tile_type>(l, r, inner_gh, cell_op, inner_perm); + }; + } + } + // element_return_op_ left null: a view cell cannot be + // value-returned (see the init_struct precondition check). + } else { + TA_EXCEPTION( + "nested non-contraction product on view inner tiles (e.g. " + "ArenaTensor) is not yet supported; only the elementwise " + "Hadamard product and the inner contraction are"); + } + } else { + init_inner_tile_op_owning_(inner_target_indices); + } + } + } + + /// Builds the inner-cell element op (element_nonreturn_op_ / + /// element_return_op_) for a nested-tensor expression. init_inner_tile_op + /// dispatches every case here except ToT x ToT with non-owning view inner + /// cells -- a view cell cannot host the value-returning inner ops this + /// builder constructs. 
+ void init_inner_tile_op_owning_(const IndexList& inner_target_indices) { if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< result_tile_type, left_tile_type, right_tile_type>; @@ -541,17 +672,52 @@ class ContEngine : public BinaryEngine { this->factor_, inner_size(this->indices_), inner_size(this->left_indices_), inner_size(this->right_indices_)); - this->element_nonreturn_op_ = - [contrreduce_op, permute_inner = this->product_type() != - TensorProduct::Contraction]( - result_tile_element_type& result, - const left_tile_element_type& left, - const right_tile_element_type& right) { - contrreduce_op(result, left, right); - // permutations of result are applied as "postprocessing" - if (permute_inner && !TA::empty(result)) - result = contrreduce_op(result); - }; + constexpr bool arena_eligible = + TiledArray::detail::is_contraction_arena_tot_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (arena_eligible) { + if (this->product_type() == TensorProduct::Contraction) { + this->arena_plan_ = + TiledArray::detail::make_contraction_arena_plan< + result_tile_type, left_tile_type, right_tile_type>( + TiledArray::detail::ArenaInnerShapeKind:: + gemm_result_range, + std::make_optional(contrreduce_op.gemm_helper()), + inner(this->perm_)); + } + } + if constexpr (arena_eligible) { + if (this->arena_plan_) { + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_contraction_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(contrreduce_op); + } else { + this->element_nonreturn_op_ = + [contrreduce_op, permute_inner = this->product_type() != + TensorProduct::Contraction]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + // permutations of result are applied as "postprocessing" + if (permute_inner && !TA::empty(result)) + result 
= contrreduce_op(result); + }; + } + } else { + this->element_nonreturn_op_ = + [contrreduce_op, permute_inner = this->product_type() != + TensorProduct::Contraction]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + // permutations of result are applied as "postprocessing" + if (permute_inner && !TA::empty(result)) + result = contrreduce_op(result); + }; + } } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { TA_ASSERT(tot_x_tot); @@ -574,26 +740,69 @@ class ContEngine : public BinaryEngine { ? inner(this->perm_) : Permutation{}) : op_type(base_op_type()); - this->element_nonreturn_op_ = - [mult_op, outer_prod](result_tile_element_type& result, - const left_tile_element_type& left, - const right_tile_element_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { // outer_prod == TensorProduct::Contraction - // there is currently no fused MultAdd ternary Op, only Add - // and Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) + constexpr bool arena_eligible_h_unit = + TiledArray::detail::is_contraction_arena_tot_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (arena_eligible_h_unit) { + if (this->product_type() == TensorProduct::Contraction) { + this->arena_plan_ = + TiledArray::detail::make_contraction_arena_plan< + result_tile_type, left_tile_type, right_tile_type>( + TiledArray::detail::ArenaInnerShapeKind::left_range, + std::nullopt, inner(this->perm_)); + } + } + if constexpr (arena_eligible_h_unit) { + if (this->arena_plan_) { + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_hadamard_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(); + } else { + 
this->element_nonreturn_op_ = + [mult_op, outer_prod]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { // outer_prod == TensorProduct::Contraction + // there is currently no fused MultAdd ternary Op, only + // Add and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } + } else { + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); + else { // outer_prod == TensorProduct::Contraction + // there is currently no fused MultAdd ternary Op, only + // Add and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } } - } - }; + }; + } } else { using base_op_type = TiledArray::detail::ScalMult< result_tile_element_type, left_tile_element_type, @@ -607,26 +816,69 @@ class ContEngine : public BinaryEngine { ? 
inner(this->perm_) : Permutation{}) : op_type(base_op_type(this->factor_)); - this->element_nonreturn_op_ = - [mult_op, outer_prod](result_tile_element_type& result, - const left_tile_element_type& left, - const right_tile_element_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - // there is currently no fused MultAdd ternary Op, only Add - // and Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) + constexpr bool arena_eligible_h_scaled = + TiledArray::detail::is_contraction_arena_tot_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (arena_eligible_h_scaled) { + if (this->product_type() == TensorProduct::Contraction) { + this->arena_plan_ = + TiledArray::detail::make_contraction_arena_plan< + result_tile_type, left_tile_type, right_tile_type>( + TiledArray::detail::ArenaInnerShapeKind::left_range, + std::nullopt, inner(this->perm_)); + } + } + if constexpr (arena_eligible_h_scaled) { + if (this->arena_plan_) { + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_hadamard_scaled_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(this->factor_); + } else { + this->element_nonreturn_op_ = + [mult_op, outer_prod]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + // there is currently no fused MultAdd ternary Op, only + // Add and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, 
right); + add_to(result, result_increment); + } + } + }; + } + } else { + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) result = mult_op(left, right); else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); + // there is currently no fused MultAdd ternary Op, only + // Add and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } } - } - }; + }; + } } } // ToT x T or T x ToT } else if (inner_prod == TensorProduct::Scale) { @@ -640,44 +892,81 @@ class ContEngine : public BinaryEngine { right_tile_type> && TiledArray::detail::is_tensor_v; if constexpr (tot_x_t || t_x_tot) { - auto scal_op = [perm = !this->implicit_permute_inner_ - ? inner(this->perm_) - : Permutation{}]( - const left_tile_element_type& left, - const right_tile_element_type& right) - -> result_tile_element_type { - using TiledArray::scale; - if constexpr (tot_x_t) { - if (perm) - return scale(left, right, perm); - else - return scale(left, right); - } else if constexpr (t_x_tot) { - if (perm) - return scale(right, left, perm); - else - return scale(right, left); - } else - abort(); // unreachable + constexpr auto kind = + tot_x_t ? 
TiledArray::detail::ArenaInnerShapeKind::left_range + : TiledArray::detail::ArenaInnerShapeKind::right_range; + constexpr bool arena_eligible_scale = + TiledArray::detail::is_contraction_arena_tot_v< + result_tile_type, left_tile_type, right_tile_type>; + if constexpr (arena_eligible_scale) { + if (this->product_type() == TensorProduct::Contraction) { + this->arena_plan_ = + TiledArray::detail::make_contraction_arena_plan< + result_tile_type, left_tile_type, right_tile_type>( + kind, std::nullopt, inner(this->perm_)); + } + } + // Fallback per-element op for the scale inner-product when no + // arena plan is in play. The Contraction outer product is the + // fused AXPY `result += (perm ^ tot) * scalar` -- no scaled + // temporary, so it works uniformly for owning and view inner + // cells. The Hadamard outer product is an assignment + // `result = (perm ^ tot) * scalar`, which needs value-returning + // `scale`; only owning inner cells support it. + auto fallback_op = [perm = !this->implicit_permute_inner_ + ? inner(this->perm_) + : Permutation{}, + outer_prod = this->product_type()]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Contraction) { + using TiledArray::axpy_to; + if constexpr (tot_x_t) { + if (perm) + axpy_to(result, left, right, perm); + else + axpy_to(result, left, right); + } else { + if (perm) + axpy_to(result, right, left, perm); + else + axpy_to(result, right, left); + } + } else { + if constexpr (!TiledArray::is_tensor_view_v< + result_tile_element_type>) { + using TiledArray::scale; + if constexpr (tot_x_t) + result = perm ? scale(left, right, perm) : scale(left, right); + else + result = perm ? 
scale(right, left, perm) : scale(right, left); + } else { + TA_EXCEPTION( + "Tensor scale-inner Hadamard-outer product: a " + "view result cell cannot be value-assigned a fresh " + "scaled tensor"); + } + } }; - this->element_nonreturn_op_ = - [scal_op, outer_prod = (this->product_type())]( - result_tile_element_type& result, - const left_tile_element_type& left, - const right_tile_element_type& right) { - if (outer_prod == TensorProduct::Contraction) { - // TODO implement X-permuting AXPY - if (empty(result)) - result = scal_op(left, right); - else { - auto result_increment = scal_op(left, right); - add_to(result, result_increment); - } - // result += scal_op(left, right); - } else { - result = scal_op(left, right); - } - }; + if constexpr (arena_eligible_scale) { + if (this->arena_plan_) { + if constexpr (tot_x_t) + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_scale_tot_x_t_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(); + else + this->element_nonreturn_op_ = + TiledArray::detail::make_fused_scale_t_x_tot_lambda< + result_tile_element_type, left_tile_element_type, + right_tile_element_type>(); + } else { + this->element_nonreturn_op_ = fallback_op; + } + } else { + this->element_nonreturn_op_ = fallback_op; + } } } else abort(); // unsupported TensorProduct type diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 84d11bd4c0..f0942ee48d 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -408,7 +408,16 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->element_return_op_)); + if constexpr (TiledArray::is_tensor_view_v< + typename value_type::value_type>) { + // arena ToT: a view inner cell cannot host a value-returning + // per-cell op, so delegate the whole tile 
product to the arena op + // built in init_inner_tile_op + return op_type(op_base_type(typename op_base_type::tile_op_tag{}, + this->arena_hadamard_tile_op_)); + } else { + return op_type(op_base_type(this->element_return_op_)); + } } else if (inner_prod == TensorProduct::Scale) { return op_type(op_base_type()); } else @@ -438,13 +447,30 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { // inner permutation, if needed, was fused into inner op, do not apply // inner part of the perm again - return op_type(op_base_type(this->element_return_op_), - outer(std::forward(perm))); + if constexpr (TiledArray::is_tensor_view_v< + typename value_type::value_type>) { + return op_type(op_base_type(typename op_base_type::tile_op_tag{}, + this->arena_hadamard_tile_op_), + outer(std::forward(perm))); + } else { + return op_type(op_base_type(this->element_return_op_), + outer(std::forward(perm))); + } } else if (inner_prod == TensorProduct::Scale) { - // inner permutation, if needed, was fused into inner op, do not apply - // inner part of the perm again - return op_type(op_base_type(this->element_return_op_), - outer(std::forward(perm))); + if constexpr (TiledArray::is_tensor_view_v< + typename value_type::value_type>) { + // arena ToT: a view result cell cannot be value-assigned a scaled + // tensor, so the element_return_op_ path is unusable. Route through + // the arena-aware mult CPO with the full permutation instead -- it + // shapes and fills the result tile as a unit and applies the + // (outer + inner) result permutation in place. 
+ return op_type(op_base_type(), std::forward(perm)); + } else { + // inner permutation, if needed, was fused into inner op, do not + // apply inner part of the perm again + return op_type(op_base_type(this->element_return_op_), + outer(std::forward(perm))); + } } else abort(); } else { // plain tensor diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index d8841a8596..009d32f9b2 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -1223,6 +1223,9 @@ namespace TiledArray { namespace detail { template struct ta_ops_match_tensor<::btas::Tensor> : std::false_type {}; +template +struct ta_ops_match_tensor_inplace<::btas::Tensor> + : std::false_type {}; } // namespace detail } // namespace TiledArray @@ -1238,6 +1241,12 @@ template inline constexpr bool ta_ops_match_tensor_v = ::TiledArray::detail::is_btas_tensor_v< ::TiledArray::detail::remove_cvr_t>; +// btas::Tensor is freestanding (owning); the compound-assignment predicate is +// identical to the value-returning one. +template +inline constexpr bool ta_ops_match_tensor_inplace_v = + ::TiledArray::detail::is_btas_tensor_v< + ::TiledArray::detail::remove_cvr_t>; } // namespace detail #include diff --git a/src/TiledArray/replicator.h b/src/TiledArray/replicator.h index 52ae446af1..8794954a20 100644 --- a/src/TiledArray/replicator.h +++ b/src/TiledArray/replicator.h @@ -166,8 +166,12 @@ class Replicator : public madness::WorldObject >, // Generate a list of local tiles from other. typename A::pmap_interface::const_iterator end = source.pmap()->end(); typename A::pmap_interface::const_iterator it = source.pmap()->begin(); - indices_.reserve(source.pmap()->local_size()); - data_.reserve(source.pmap()->local_size()); + // local_size() is only a reserve() hint; some pmaps (e.g. HashPmap) do + // not precompute it -- skip the hint rather than assert. 
+ if (source.pmap()->known_local_size()) { + indices_.reserve(source.pmap()->local_size()); + data_.reserve(source.pmap()->local_size()); + } if (source.is_dense()) { // When dense, all tiles are present for (; it != end; ++it) { diff --git a/src/TiledArray/tensor/arena.h b/src/TiledArray/tensor/arena.h new file mode 100644 index 0000000000..b37b962436 --- /dev/null +++ b/src/TiledArray/tensor/arena.h @@ -0,0 +1,160 @@ +/// Arena implementation +#ifndef TILEDARRAY_TENSOR_ARENA_H__INCLUDED +#define TILEDARRAY_TENSOR_ARENA_H__INCLUDED + +#include "TiledArray/config.h" +#include "TiledArray/error.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace TiledArray { +namespace detail { + +/// Kill switch: when true, hooks fall back to the legacy heap path. +inline bool& arena_disabled() { + static bool flag = false; + return flag; +} + +/// One-shot bump allocator; slab is co-owned via aliasing shared_ptrs. +class Arena { + public: + explicit Arena( + std::pmr::memory_resource* mr = std::pmr::new_delete_resource()) noexcept + : resource_(mr) { + TA_ASSERT(resource_ != nullptr); + } + + Arena(const Arena&) = delete; + Arena& operator=(const Arena&) = delete; + Arena(Arena&&) noexcept = default; + Arena& operator=(Arena&&) noexcept = default; + ~Arena() = default; + + /// Allocate the slab once; zero_init clears it for accumulation kernels. + /// `alignment` (default `alignof(std::max_align_t)`) is the alignment of + /// the slab base; pass a larger power-of-two when callers need SIMD-aligned + /// element pointers at known interior offsets. 
+ void reserve(std::size_t bytes, bool zero_init = false, + std::size_t alignment = alignof(std::max_align_t)) { + TA_ASSERT(capacity_ == 0); + TA_ASSERT(bytes > 0); + TA_ASSERT(alignment >= alignof(std::max_align_t)); + TA_ASSERT((alignment & (alignment - 1)) == 0); + void* raw = resource_->allocate(bytes, alignment); + auto* mr = resource_; + auto deleter = [mr, bytes, alignment](std::byte* p) noexcept { + mr->deallocate(p, bytes, alignment); + }; + slab_ = std::shared_ptr(static_cast(raw), + std::move(deleter)); + capacity_ = bytes; + cursor_ = 0; + if (zero_init) std::memset(slab_.get(), 0, bytes); + } + + /// Aliasing view at a caller-aligned offset. + template + std::shared_ptr slice(std::size_t offset, std::size_t /*n_elem*/) const { + TA_ASSERT(slab_); + TA_ASSERT(offset % alignof(T) == 0); + TA_ASSERT(offset <= capacity_); + auto* p = reinterpret_cast(slab_.get() + offset); + return std::shared_ptr(slab_, p); + } + + /// Bump-allocate n elements of T; result is T-aligned. + template + std::shared_ptr claim(std::size_t n) { + TA_ASSERT(slab_); + auto base = reinterpret_cast(slab_.get() + cursor_); + auto aligned = (base + alignof(T) - 1) & ~(alignof(T) - 1); + std::size_t pad = static_cast(aligned - base); + std::size_t consumed = pad + n * sizeof(T); + TA_ASSERT(cursor_ + consumed <= capacity_); + cursor_ += consumed; + return std::shared_ptr(slab_, reinterpret_cast(aligned)); + } + + std::size_t capacity() const noexcept { return capacity_; } + std::size_t cursor() const noexcept { return cursor_; } + std::size_t remaining() const noexcept { return capacity_ - cursor_; } + bool empty() const noexcept { return cursor_ == 0; } + std::pmr::memory_resource* resource() const noexcept { return resource_; } + + private: + std::pmr::memory_resource* resource_; + std::shared_ptr slab_; + std::size_t capacity_ = 0; + std::size_t cursor_ = 0; +}; + +/// Per-cell offsets and total slab size produced by plan(). 
+struct ArenaPlan { + std::vector offsets; + std::size_t total_bytes = 0; +}; + +/// Cache-line-floor alignment used by production callers. +inline constexpr std::size_t kArenaCachelineAlign = 128; + +/// Round bytes up to a power-of-two alignment. +inline std::size_t arena_align_up(std::size_t bytes, + std::size_t alignment) noexcept { + return (bytes + alignment - 1) & ~(alignment - 1); +} + +/// Pre-walk cells once to compute offsets and total bytes. +template +ArenaPlan plan(std::size_t N_cells, ShapeFn&& shape_fn, + std::size_t element_size, std::size_t alignment) { + ArenaPlan out; + out.offsets.resize(N_cells); + std::size_t total = 0; + for (std::size_t ord = 0; ord < N_cells; ++ord) { + out.offsets[ord] = total; + auto&& r = shape_fn(ord); + std::size_t bytes = r.volume() * element_size; + total += arena_align_up(bytes, alignment); + } + out.total_bytes = total; + return out; +} + +/// PMR adapter over an Arena; deallocation is a no-op (slab-owned lifetime). +class ArenaResource final : public std::pmr::memory_resource { + public: + explicit ArenaResource(Arena* arena) noexcept : arena_(arena) { + TA_ASSERT(arena != nullptr); + } + + Arena* arena() const noexcept { return arena_; } + + protected: + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + auto h = arena_->claim(arena_align_up(bytes, alignment)); + return h.get(); + } + + void do_deallocate(void* /*p*/, std::size_t /*bytes*/, + std::size_t /*alignment*/) override {} + + bool do_is_equal( + const std::pmr::memory_resource& other) const noexcept override { + return this == &other; + } + + private: + Arena* arena_; +}; + +} // namespace detail +} // namespace TiledArray + +#endif diff --git a/src/TiledArray/tensor/arena_einsum.h b/src/TiledArray/tensor/arena_einsum.h new file mode 100644 index 0000000000..7e917e2bf8 --- /dev/null +++ b/src/TiledArray/tensor/arena_einsum.h @@ -0,0 +1,793 @@ +/// Arena-aware ToT einsum: plans, fused kernels, and dispatch. 
+ +#ifndef TILEDARRAY_TENSOR_ARENA_EINSUM_H__INCLUDED +#define TILEDARRAY_TENSOR_ARENA_EINSUM_H__INCLUDED + +#include "TiledArray/error.h" +#include "TiledArray/math/gemm_helper.h" +#include "TiledArray/permutation.h" +#include "TiledArray/tensor/arena.h" +#include "TiledArray/tensor/arena_kernels.h" +#include "TiledArray/tensor/kernels.h" +#include "TiledArray/tensor/type_traits.h" + +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER < 1937 // VS 2022 < 17.7 +#define TA_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]] +#else +#define TA_NO_UNIQUE_ADDRESS [[no_unique_address]] +#endif + +namespace TiledArray::detail { + +/// Specifies how an inner-cell range is derived from operand inner cells. +enum class ArenaInnerShapeKind { + left_range, // Hadamard inner; Scale tot_x_t + right_range, // Scale t_x_tot + gemm_result_range // inner Contraction (uses inner_gh) +}; + +/// Inner-shape derivation plan: kind + (optional) inner GemmHelper. +struct ArenaInnerShapePlan { + ArenaInnerShapeKind kind; + std::optional inner_gh; // only for gemm_result_range + + /// Derives one result inner range from operand inner cells. + template + ResultInnerRange make(const LInner& l, const RInner& r) const { + switch (kind) { + case ArenaInnerShapeKind::left_range: + return l.range(); + case ArenaInnerShapeKind::right_range: + return r.range(); + case ArenaInnerShapeKind::gemm_result_range: + TA_ASSERT(inner_gh.has_value()); + return inner_gh->template make_result_range( + l.range(), r.range()); + } + TA_ASSERT(false); + return ResultInnerRange{}; + } +}; + +/// Derives result ranges and constructs non-empty inner cells in one arena +/// slab. +template +class ContractionArenaPlan { + public: + /// Stores the inner shape plan used to construct result cells. + explicit ContractionArenaPlan(ArenaInnerShapePlan p) + : inner_plan_(std::move(p)) {} + + /// Constructs a result tile whose non-empty inner cells alias arena storage. 
+ Result reserve_and_construct(const Left& left, const Right& right, + const math::GemmHelper& outer_gh) const; + + /// Grows an already-constructed result tile in place so it covers every + /// inner cell implied by this `left`/`right` K-panel. A SUMMA reduction + /// shapes the result from its first K-panel only; a later panel of a + /// contracted-dimension-sparse ToT operand can touch inner cells the first + /// panel left null, so each subsequent panel must extend the result. + void grow_to_cover(Result& result, const Left& left, const Right& right, + const math::GemmHelper& outer_gh) const; + + private: + /// Per-output-cell inner ranges implied by one `left`/`right` K-panel. + /// Deduced return type: spelling `Result::value_type::range_type` in the + /// declaration would make the whole class ill-formed for a non-ToT + /// `Result`, but `make_contraction_arena_plan` names this class in its + /// return type unconditionally (and returns nullopt for non-ToT). + auto operand_inner_ranges(const Left& left, const Right& right, + const math::GemmHelper& outer_gh) const; + + ArenaInnerShapePlan inner_plan_{}; +}; + +/// True when `T` is a `TA::Tensor` outer whose inner cells the arena +/// machinery knows how to allocate (legacy `TA::Tensor` ToT inner or the +/// pinned-view `ArenaTensor`). Doesn't require `is_tensor_of_tensor_v` -- +/// `ArenaTensor` is deliberately not registered as `is_tensor_helper`, so +/// trait propagation can't reach it that way. +template +inline constexpr bool is_arena_eligible_outer_v = + is_ta_tensor_v && + (is_ta_tensor_v || + ::TiledArray::is_arena_tensor_v); + +/// True when `T` is an inner-cell type that the arena machinery treats as +/// tensor-shaped (as opposed to a scalar in mixed Scale ops). Covers the +/// legacy `TA::Tensor` inner and the pinned `ArenaTensor`. Used by the +/// regime-A `accumulate` dispatch to distinguish the tensor-inner branches +/// from the scalar-inner ones in `scale_left`/`scale_right` cases. 
+template +inline constexpr bool is_arena_inner_cell_v = + is_ta_tensor_v || ::TiledArray::is_arena_tensor_v; + +/// True when the result is an arena-eligible outer; gates the arena +/// allocation path in cont_engine. +template +inline constexpr bool is_contraction_arena_tot_v = + is_arena_eligible_outer_v; + +/// Stores an arena plan for ToT results and std::monostate otherwise. +template +using arena_plan_storage_t = + std::conditional_t, + std::optional>, + std::monostate>; + +/// Builds a contraction arena plan when the result and inner permutation allow +/// it. +template +auto make_contraction_arena_plan(ArenaInnerShapeKind inner_kind, + std::optional inner_gh, + const Permutation& inner_perm) + -> std::optional> { + if (arena_disabled()) return std::nullopt; + if constexpr (!is_contraction_arena_tot_v) { + return std::nullopt; + } else { + if (bool(inner_perm) && !inner_perm.is_identity()) return std::nullopt; + if (inner_kind != ArenaInnerShapeKind::gemm_result_range) + inner_gh.reset(); + else if (!inner_gh.has_value()) + return std::nullopt; + return std::optional>( + std::in_place, ArenaInnerShapePlan{inner_kind, std::move(inner_gh)}); + } +} + +/// Per-output-cell inner ranges implied by one `left`/`right` K-panel. +template +auto ContractionArenaPlan::operand_inner_ranges( + const Left& left, const Right& right, + const math::GemmHelper& outer_gh) const { + using inner_t = typename Result::value_type; + using inner_range_t = typename inner_t::range_type; + using integer = math::blas::integer; + + integer M, N, K; + outer_gh.compute_matrix_sizes(M, N, K, left.range(), right.range()); + const integer lda = (outer_gh.left_op() == math::blas::NoTranspose) ? K : M; + const integer ldb = (outer_gh.right_op() == math::blas::NoTranspose) ? 
N : K; + TA_ASSERT(left.nbatch() == right.nbatch()); + const std::size_t batch_sz = static_cast(left.nbatch()); + const std::size_t mn = + static_cast(M) * static_cast(N); + + auto range_for = [&](std::size_t ord) -> inner_range_t { + if (mn == 0) return inner_range_t{}; + const integer b = static_cast(ord / mn); + const integer rem = static_cast(ord % mn); + const integer m = rem / N; + const integer n = rem % N; + + if (inner_plan_.kind == ArenaInnerShapeKind::left_range) { + if constexpr (is_arena_eligible_outer_v) { + const auto* lbase = left.batch_data(static_cast(b)); + for (integer k = 0; k != K; ++k) { + const auto aoff = (outer_gh.left_op() == math::blas::NoTranspose) + ? m * lda + k + : k * lda + m; + const auto& lc = *(lbase + aoff); + if (!lc.empty()) return lc.range(); + } + } + return inner_range_t{}; + } + if (inner_plan_.kind == ArenaInnerShapeKind::right_range) { + if constexpr (is_arena_eligible_outer_v) { + const auto* rbase = right.batch_data(static_cast(b)); + for (integer k = 0; k != K; ++k) { + const auto boff = (outer_gh.right_op() == math::blas::NoTranspose) + ? k * ldb + n + : n * ldb + k; + const auto& rc = *(rbase + boff); + if (!rc.empty()) return rc.range(); + } + } + return inner_range_t{}; + } + // gemm_result_range needs both operands to be ToT. + if constexpr (is_arena_eligible_outer_v && + is_arena_eligible_outer_v) { + const auto* lbase = left.batch_data(static_cast(b)); + const auto* rbase = right.batch_data(static_cast(b)); + for (integer k = 0; k != K; ++k) { + const auto aoff = (outer_gh.left_op() == math::blas::NoTranspose) + ? m * lda + k + : k * lda + m; + const auto boff = (outer_gh.right_op() == math::blas::NoTranspose) + ? 
k * ldb + n + : n * ldb + k; + const auto& lc = *(lbase + aoff); + const auto& rc = *(rbase + boff); + if (lc.empty() || rc.empty()) continue; + return inner_plan_.template make(lc, rc); + } + } + return inner_range_t{}; + }; + + std::vector ranges; + const std::size_t N_cells = mn * batch_sz; + ranges.reserve(N_cells); + for (std::size_t ord = 0; ord < N_cells; ++ord) + ranges.emplace_back(range_for(ord)); + return ranges; +} + +/// Reserves arena storage and constructs the result tensor-of-tensor tile. +template +Result ContractionArenaPlan::reserve_and_construct( + const Left& left, const Right& right, + const math::GemmHelper& outer_gh) const { + using inner_range_t = typename Result::value_type::range_type; + auto outer_range = + outer_gh.template make_result_range( + left.range(), right.range()); + TA_ASSERT(left.nbatch() == right.nbatch()); + const std::size_t batch_sz = static_cast(left.nbatch()); + const auto ranges = operand_inner_ranges(left, right, outer_gh); + // arena_outer_init dispatches internally on the inner-cell type. + return detail::arena_outer_init( + outer_range, batch_sz, + [&ranges](std::size_t ord) -> inner_range_t { return ranges[ord]; }); +} + +/// Grows an already-constructed result tile to cover this K-panel's cells. +template +void ContractionArenaPlan::grow_to_cover( + Result& result, const Left& left, const Right& right, + const math::GemmHelper& outer_gh) const { + using inner_range_t = typename Result::value_type::range_type; + const auto ranges = operand_inner_ranges(left, right, outer_gh); + detail::arena_tot_grow_inplace( + result, + [&ranges](std::size_t ord) -> inner_range_t { return ranges[ord]; }); +} + +/// Accumulates a contraction into an already-allocated result cell. 
+template +void fused_contraction_inplace(Result& result, const Left& left, + const Right& right, Scalar alpha, + const math::GemmHelper& gh) { + if (left.empty() || right.empty()) return; + TA_ASSERT(!result.empty()); + // Free `gemm` CPO, not the member: `ArenaTensor` (a view) provides only the + // free in-place overload, while `TA::Tensor` is reached via the + // `tile_interface.h` CPO that forwards to its member. + gemm(result, left, right, alpha, gh); +} + +/// Accumulates an elementwise product into an already-allocated result cell. +template +void fused_hadamard_inplace(Result& result, const Left& left, + const Right& right) { + if (left.empty() || right.empty()) return; + TA_ASSERT(!result.empty()); + inplace_tensor_op( + [](typename Result::value_type& MADNESS_RESTRICT r, + const typename Left::value_type& MADNESS_RESTRICT l, + const typename Right::value_type& MADNESS_RESTRICT rr) { + r += l * rr; + }, + result, left, right); +} + +/// Accumulates a scaled elementwise product into an allocated result cell. +template +void fused_hadamard_scaled_inplace(Result& result, const Left& left, + const Right& right, Scalar factor) { + if (left.empty() || right.empty()) return; + TA_ASSERT(!result.empty()); + // Preserve historical grouping: r += (l * rr) * factor. + inplace_tensor_op( + [factor](typename Result::value_type& MADNESS_RESTRICT r, + const typename Left::value_type& MADNESS_RESTRICT l, + const typename Right::value_type& MADNESS_RESTRICT rr) { + r += (l * rr) * factor; + }, + result, left, right); +} + +/// Accumulates a ToT cell scaled by a scalar right operand. +template +void fused_scale_tot_x_t_inplace(Result& result, const Left& left, + const Scalar& s) { + if (left.empty()) return; + TA_ASSERT(!result.empty()); + inplace_tensor_op( + [s](typename Result::value_type& MADNESS_RESTRICT r, + const typename Left::value_type& MADNESS_RESTRICT l) { r += l * s; }, + result, left); +} + +/// Accumulates a ToT right operand scaled by a scalar left operand. 
+template +void fused_scale_t_x_tot_inplace(Result& result, const Scalar& s, + const Right& right) { + if (right.empty()) return; + TA_ASSERT(!result.empty()); + inplace_tensor_op( + [s](typename Result::value_type& MADNESS_RESTRICT r, + const typename Right::value_type& MADNESS_RESTRICT rr) { + r += rr * s; + }, + result, right); +} + +/// Creates a fused contraction callback. +template +auto make_fused_contraction_lambda(Op contrreduce_op) { + return + [contrreduce_op](Result& result, const Left& left, const Right& right) { + TA_ASSERT(!contrreduce_op.perm()); + fused_contraction_inplace(result, left, right, contrreduce_op.factor(), + contrreduce_op.gemm_helper()); + }; +} + +/// Hadamard-outer, contraction-inner ToT x ToT product into a fresh arena +/// tile. `left` and `right` share the (Hadamard) outer layout; each result +/// outer cell is the inner GEMM of the corresponding left/right inner cells, +/// shaped by `inner_gh`. `cell_op(result_cell, left_cell, right_cell)` runs +/// the per-cell in-place contraction (e.g. the make_fused_contraction_lambda +/// callback). The per-cell op is perm-free; a non-identity `inner_perm` +/// permutes the result cells' inner modes as a slab-level post-pass. 
+template +Result arena_hadamard_inner_contract(const Left& left, const Right& right, + const math::GemmHelper& inner_gh, + const CellOp& cell_op, + const Permutation& inner_perm) { + using inner_range_t = typename Result::value_type::range_type; + TA_ASSERT(left.range().volume() == right.range().volume()); + TA_ASSERT(left.nbatch() == right.nbatch()); + const std::size_t N_cells = left.range().volume() * left.nbatch(); + auto range_fn = [&left, &right, &inner_gh](std::size_t ord) -> inner_range_t { + const auto& lc = left.data()[ord]; + const auto& rc = right.data()[ord]; + if (lc.empty() || rc.empty()) return inner_range_t{}; + return inner_gh.template make_result_range(lc.range(), + rc.range()); + }; + Result result = + arena_outer_init(left.range(), left.nbatch(), range_fn); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + if (result.data()[ord].empty()) continue; + cell_op(result.data()[ord], left.data()[ord], right.data()[ord]); + } + if (inner_perm && !inner_perm.is_identity()) + result = arena_inner_permute(result, inner_perm); + return result; +} + +/// Creates a fused Hadamard callback. +template +auto make_fused_hadamard_lambda() { + return [](Result& result, const Left& left, const Right& right) { + fused_hadamard_inplace(result, left, right); + }; +} + +/// Creates a fused scaled-Hadamard callback. +template +auto make_fused_hadamard_scaled_lambda(Scalar factor) { + return [factor](Result& result, const Left& left, const Right& right) { + fused_hadamard_scaled_inplace(result, left, right, factor); + }; +} + +/// Creates a fused ToT-times-scalar callback. +template +auto make_fused_scale_tot_x_t_lambda() { + return [](Result& result, const Left& left, const Right& right) { + fused_scale_tot_x_t_inplace(result, left, right); + }; +} + +/// Creates a fused scalar-times-ToT callback. 
+template +auto make_fused_scale_t_x_tot_lambda() { + return [](Result& result, const Left& left, const Right& right) { + fused_scale_t_x_tot_inplace(result, left, right); + }; +} + +/// Discriminates the per-cell operation used by the arena regime-A path. +enum class RegimeAInnerKind { + hadamard, + contraction, + scale_left, // ToT × plain T → ToT (right operand contributes scalars) + scale_right // plain T × ToT → ToT (left operand contributes scalars) +}; + +/// Permute the extents of `src` by `perm` and materialize a range of type +/// `RangeT`. Generic over the inner-cell range types regime-A einsum sees: +/// `TA::Range` (legacy `Tensor` inners) and `btas::zb::RangeNd` +/// (`Tensor` inners). `Permutation * Range` only exists for +/// `TA::Range`, so the permutation is applied to a plain extent vector and +/// the target range is rebuilt from the result. +template +RangeT arena_make_permuted_range(const TiledArray::Permutation& perm, + const SrcRange& src) { + const std::size_t rank = src.rank(); + const auto& src_ext = src.extent(); + container::svector ext(rank); + for (std::size_t d = 0; d < rank; ++d) + ext[d] = static_cast(src_ext[d]); + if (perm && !perm.is_identity()) { + TA_ASSERT(perm.size() == rank); + return RangeT(perm * ext); + } + return RangeT(ext); +} + +/// Holds the inner operation plan for arena regime-A dispatch. +template +struct RegimeAArenaPlan { + using Annot = ::Einsum::Index; + + bool active = false; + RegimeAInnerKind kind = RegimeAInnerKind::hadamard; + + // Exactly one plan optional is engaged; optionals avoid default construction. + std::optional> h_plan{}; + std::optional> c_plan{}; + + /// Derives the result inner range from a non-empty input-cell pair. 
+ template + InnerRange derive_inner_range(const LRange& l_range, + const RRange& r_range) const { + switch (kind) { + case RegimeAInnerKind::hadamard: + TA_ASSERT(h_plan.has_value()); + return arena_make_permuted_range(h_plan->perm.AC, l_range); + case RegimeAInnerKind::contraction: { + TA_ASSERT(c_plan.has_value()); + const auto& p = *c_plan; + using PlanIndices = std::remove_cvref_t; + using PlanIndex = typename PlanIndices::value_type; + using Extent = + std::remove_cv_t() + .extent())::value_type>; + using ExtentMap = ::Einsum::index::IndexMap; + ExtentMap extent = (ExtentMap{p.A, l_range.extent()} | + ExtentMap{p.B, r_range.extent()}); + container::vector rng; + rng.reserve(p.e.size()); + for (auto&& ix : p.e) rng.emplace_back(extent[ix]); + return InnerRange(rng); + } + case RegimeAInnerKind::scale_left: + // Scale-left preserves the ToT operand's inner range. + return InnerRange(l_range); + case RegimeAInnerKind::scale_right: + return InnerRange(r_range); + } + TA_ASSERT(false && "RegimeAInnerKind: unhandled kind"); + return InnerRange{}; + } + + /// Accumulates one input-cell pair into the result cell. + template + void accumulate(ResultCell& r, const LCell& l, const RCell& rr) const { + switch (kind) { + case RegimeAInnerKind::hadamard: { + if constexpr (is_arena_inner_cell_v && + is_arena_inner_cell_v) { + if (l.empty() || rr.empty()) return; + TA_ASSERT(h_plan.has_value()); + // run_regime_a_arena has already hoisted any operand inner + // permutation, so l and rr are both in C-layout: the per-cell op + // is a flat r += l * rr on congruent cells. 
+ fused_hadamard_inplace(r, l, rr); + } + return; + } + case RegimeAInnerKind::contraction: { + if constexpr (is_arena_inner_cell_v && + is_arena_inner_cell_v) { + if (l.empty() || rr.empty()) return; + TA_ASSERT(c_plan.has_value()); + // run_regime_a_arena has already hoisted any operand inner + // permutation, so l and rr are in canonical (blas_layout) order: + // the per-cell op is a single canonical GEMM into r with beta=1. + // Uniform for TA::Tensor and ArenaTensor cells (free `gemm` CPO). + using Scalar = typename std::remove_cv_t::numeric_type; + fused_contraction_inplace(r, l, rr, Scalar{1}, c_plan->gemm_helper); + } + return; + } + case RegimeAInnerKind::scale_left: { + // Scale-left receives a ToT inner cell and a scalar. + if constexpr (is_arena_inner_cell_v && + !is_arena_inner_cell_v) { + if (l.empty()) return; + fused_scale_tot_x_t_inplace(r, l, rr); + } + return; + } + case RegimeAInnerKind::scale_right: { + if constexpr (!is_arena_inner_cell_v && + is_arena_inner_cell_v) { + if (rr.empty()) return; + fused_scale_t_x_tot_inplace(r, l, rr); + } + return; + } + } + } +}; + +/// Builds an arena regime-A plan when result and permutation constraints allow +/// it. +template +auto make_regime_a_arena_plan(const A& a, const B& b, const Inner& inner, + const PermT& inner_perm) + -> RegimeAArenaPlan { + using Plan = RegimeAArenaPlan; + Plan plan; + if (arena_disabled()) return plan; + if constexpr (!is_arena_eligible_outer_v) { + return plan; + } else { + // `inner_perm` (== C.permutation at the call site) is the result *outer* + // permutation. run_regime_a_arena applies it itself via tile.permute(pc) + // -- byte-identical to the legacy non-arena path, and supported for an + // arena ToT via arena_permute_shallow -- so it does not gate the plan. + // Inner-operand and inner-result permutations are likewise handled, by + // hoisting them to slab-level arena_inner_permute rewrites (see below). 
+ (void)inner_perm; + + using ArrayA_t = std::remove_cvref_t; + using ArrayB_t = std::remove_cvref_t; + // "Tot" here means "tile is a ToT-like thing whose inner cell is the + // tensor we want to operate on"; covers both legacy TA::Tensor inners + // and pinned ArenaTensor inners. + constexpr bool a_is_tot = + is_arena_eligible_outer_v; + constexpr bool b_is_tot = + is_arena_eligible_outer_v; + + if constexpr (a_is_tot && b_is_tot) { + if (static_cast(inner.h)) { + plan.kind = RegimeAInnerKind::hadamard; + plan.h_plan.emplace(inner.A, inner.B, inner.C); + // A non-canonical inner Hadamard (h_plan.perm.{AC,BC} non-identity) + // is handled the same way as a non-canonical inner contraction: + // run_regime_a_arena hoists each operand inner permutation to a + // slab-level rewrite (arena_inner_permute) so both operands reach + // C-layout before the per-cell flat r += l * rr. No need to bail. + } else { + plan.kind = RegimeAInnerKind::contraction; + plan.c_plan.emplace(inner.A, inner.B, inner.C); + // A non-canonical inner contraction (c_plan.do_perm.{A,B,C} set -- + // e.g. M/K- or M/N-interleaved inner annotations that are not + // GEMM-absorbable transposes) is still handled: run_regime_a_arena + // hoists each operand inner permutation, and the result inner + // permutation, to slab-level rewrites (arena_inner_permute), leaving + // the per-cell op a single canonical GEMM. No need to bail here. + } + } else if constexpr (a_is_tot && !b_is_tot) { + plan.kind = RegimeAInnerKind::scale_left; + } else if constexpr (!a_is_tot && b_is_tot) { + plan.kind = RegimeAInnerKind::scale_right; + } else { + return plan; + } + plan.active = true; + (void)a; + (void)b; + return plan; + } +} + +/// Runs the arena regime-A path for one H-slice when the plan is active. 
+template +bool run_regime_a_arena(const Plan& plan, const HIndex& h, std::size_t batch, + const TermA& A, const TermB& B, const TermC& C, + LocalTiles& C_local_tiles, const Tiles& tiles, + const Trange& trange) { + if (!plan.active) return false; + + using ResultTensor = typename LocalTiles::value_type::second_type; + // Guard avoids naming inner-cell APIs for non-ToT instantiations. + using ArrayA_t = std::remove_cvref_t; + using ArrayB_t = std::remove_cvref_t; + // ToT-like in the regime-A sense: tile is an arena-eligible outer + // (legacy TA::Tensor inner or pinned ArenaTensor inner). + constexpr bool a_is_tot = + is_arena_eligible_outer_v; + constexpr bool b_is_tot = + is_arena_eligible_outer_v; + if constexpr (!is_arena_eligible_outer_v || + (!a_is_tot && !b_is_tot)) { + (void)h; + (void)batch; + (void)A; + (void)B; + (void)C; + (void)C_local_tiles; + (void)tiles; + (void)trange; + return false; + } else { + using InnerT = typename ResultTensor::value_type; + using InnerRange = typename InnerT::range_type; + + const auto& pa = A.permutation; + const auto& pb = B.permutation; + const auto& pc = C.permutation; + auto const c = apply(pc, h); + + if constexpr (a_is_tot && b_is_tot) { + using IIndex = ::Einsum::index::Index; + auto range_for = [&](std::size_t k) -> InnerRange { + if (k >= batch) return InnerRange{}; + for (IIndex i : tiles) { + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + for (decltype(vol) j = 0; j < vol; ++j) { + const auto& l_inner = aik.data()[j]; + const auto& 
r_inner = bik.data()[j]; + if (l_inner.empty() || r_inner.empty()) continue; + return plan.template derive_inner_range( + l_inner.range(), r_inner.range()); + } + } + return InnerRange{}; + }; + + ResultTensor tile = arena_outer_init( + TiledArray::Range{batch}, /*batch_sz=*/1, range_for); + + for (IIndex i : tiles) { + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + // Hoist a non-canonical inner op's operand inner permutations to + // slab-level rewrites, so the per-cell op below stays canonical: + // contraction -> a single canonical GEMM; Hadamard -> a flat + // r += l * rr on congruent C-layout cells. No per-cell view permute. + if (plan.kind == RegimeAInnerKind::contraction) { + const auto& cp = *plan.c_plan; + if (cp.do_perm.A) + ai = arena_inner_permute(ai, cp.perm.A); + if (cp.do_perm.B) + bi = arena_inner_permute(bi, cp.perm.B); + } else if (plan.kind == RegimeAInnerKind::hadamard) { + const auto& hp = *plan.h_plan; + if (!hp.perm.AC.is_identity()) + ai = arena_inner_permute(ai, hp.perm.AC); + if (!hp.perm.BC.is_identity()) + bi = arena_inner_permute(bi, hp.perm.BC); + } + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (std::size_t k = 0; k < batch; ++k) { + auto& cell = tile({k}); + if (cell.empty()) continue; + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + for (decltype(vol) j = 0; j < vol; ++j) { + const auto& l_inner = aik.data()[j]; + const auto& r_inner = bik.data()[j]; + plan.accumulate(cell, l_inner, r_inner); + } + } + } + + // Hoist the result inner permutation: cells were accumulated in + // blas_layout (e) order; rewrite the slab to the C inner order. 
+ if (plan.kind == RegimeAInnerKind::contraction && plan.c_plan->do_perm.C) + tile = + arena_inner_permute(tile, plan.c_plan->perm.C.inv()); + auto shape = apply_inverse(pc, C.array.trange().tile(c)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C_local_tiles.emplace_back(std::move(c), std::move(tile)); + return true; + } else { + // Scale path has exactly one ToT operand and one scalar-cell operand. + using IIndex = ::Einsum::index::Index; + auto range_for = [&](std::size_t k) -> InnerRange { + if (k >= batch) return InnerRange{}; + for (IIndex i : tiles) { + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + auto aik = ai.batch(k); + auto bik = bi.batch(k); + if constexpr (a_is_tot) { + auto vol = aik.total_size(); + for (decltype(vol) j = 0; j < vol; ++j) { + const auto& l_inner = aik.data()[j]; + if (l_inner.empty()) continue; + return InnerRange(l_inner.range()); + } + } else { + auto vol = bik.total_size(); + for (decltype(vol) j = 0; j < vol; ++j) { + const auto& r_inner = bik.data()[j]; + if (r_inner.empty()) continue; + return InnerRange(r_inner.range()); + } + } + } + return InnerRange{}; + }; + + ResultTensor tile = arena_outer_init( + TiledArray::Range{batch}, /*batch_sz=*/1, range_for); + + for (IIndex i : tiles) { + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = 
ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (std::size_t k = 0; k < batch; ++k) { + auto& cell = tile({k}); + if (cell.empty()) continue; + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + for (decltype(vol) j = 0; j < vol; ++j) { + const auto& l_elem = aik.data()[j]; + const auto& r_elem = bik.data()[j]; + plan.accumulate(cell, l_elem, r_elem); + } + } + } + + auto shape = apply_inverse(pc, C.array.trange().tile(c)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C_local_tiles.emplace_back(std::move(c), std::move(tile)); + return true; + } + } +} + +} // namespace TiledArray::detail + +#endif // TILEDARRAY_TENSOR_ARENA_EINSUM_H__INCLUDED diff --git a/src/TiledArray/tensor/arena_kernels.h b/src/TiledArray/tensor/arena_kernels.h new file mode 100644 index 0000000000..8dcd97c870 --- /dev/null +++ b/src/TiledArray/tensor/arena_kernels.h @@ -0,0 +1,442 @@ +/// Arena kernels for tensor-of-tensor (ToT) outer tiles. +/// +/// One slab-backed builder family, dispatching on the inner-tile type: +/// - `ArenaTensor` inners -> slab of `Cell`s (range header + element data), +/// each inner cell is an 8-byte view; +/// - `TA::Tensor` inners -> slab of element data, each inner `Tensor` +/// aliases its slice of the slab. +/// `is_arena_tensor_v` selects the per-cell layout; everything else +/// (planning, allocation, outer-tile assembly) is shared. + +#ifndef TILEDARRAY_TENSOR_ARENA_KERNELS_H__INCLUDED +#define TILEDARRAY_TENSOR_ARENA_KERNELS_H__INCLUDED + +#include "TiledArray/config.h" +#include "TiledArray/error.h" +#include "TiledArray/tensor/arena.h" +#include "TiledArray/tensor/arena_tensor.h" + +#include +#include +#include +#include +#include +#include + +namespace TiledArray { +namespace detail { + +namespace { + +/// Build outer storage whose deleter owns arena and alias keep-alive state. 
+template +std::shared_ptr make_outer_data( + std::size_t n_cells, std::shared_ptr arena_handle, + KeepAlive keep_alive) { + using inner_t = typename OuterTensor::value_type; + std::allocator allocator; + inner_t* raw = allocator.allocate(n_cells); + auto deleter = + [allocator = std::move(allocator), arena_handle = std::move(arena_handle), + keep_alive = std::move(keep_alive), n_cells](inner_t* p) mutable { + for (std::size_t i = 0; i < n_cells; ++i) (p + i)->~inner_t(); + allocator.deallocate(p, n_cells); + (void)arena_handle; + (void)keep_alive; + }; + return std::shared_ptr(raw, std::move(deleter)); +} + +} // namespace + +/// Allocate a slab-backed ToT outer tile with caller-provided inner ranges. +/// +/// `inner_range_fn(cell_ordinal)` -> inner `range_type` for each cell ordinal +/// in `[0, outer_range.volume() * batch_sz)`; a zero-volume range yields a +/// deliberately-null inner cell that consumes no slab bytes. Element storage +/// is left zero-initialized when `zero_init` is true. `cell_stride_align` is +/// the minimum byte stride between adjacent cells; it is bumped up to the +/// inner type's natural alignment (`ArenaTensor::cell_alignment()`, or +/// `alignof(T)` for `TA::Tensor` inners). +template +OuterTensor arena_outer_init( + const typename OuterTensor::range_type& outer_range, std::size_t batch_sz, + InnerRangeFn&& inner_range_fn, + std::size_t cell_stride_align = kArenaCachelineAlign, + bool zero_init = true) { + using InnerT = typename OuterTensor::value_type; + using T = typename InnerT::value_type; + using InnerRange = typename InnerT::range_type; + constexpr bool arena = is_arena_tensor_v; + + std::size_t stride = cell_stride_align; + if constexpr (arena) { + if (InnerT::cell_alignment() > stride) stride = InnerT::cell_alignment(); + } else { + if (alignof(T) > stride) stride = alignof(T); + } + // Cells pack at `stride` granularity, but the slab base handed to + // `Arena::reserve` must be at least `max_align_t`-aligned. 
+ const std::size_t slab_align = + stride > alignof(std::max_align_t) ? stride : alignof(std::max_align_t); + + const std::size_t N_cells = outer_range.volume() * batch_sz; + constexpr std::size_t kNull = static_cast(-1); + std::vector ranges; + ranges.reserve(N_cells); + std::vector offsets(N_cells, 0); + std::size_t total = 0; + for (std::size_t ord = 0; ord < N_cells; ++ord) { + ranges.emplace_back(inner_range_fn(ord)); + const std::size_t vol = ranges.back().volume(); + if (vol == 0) { + offsets[ord] = kNull; + } else { + offsets[ord] = total; + // `if constexpr`, not a ternary: `InnerT::cell_size` does not exist for + // a `TA::Tensor` inner, so the non-arena branch must not be formed. + std::size_t bytes; + if constexpr (arena) + bytes = InnerT::cell_size(vol); + else + bytes = vol * sizeof(T); + total += arena_align_up(bytes, stride); + } + } + + auto arena_slab = std::make_shared(); + if (total > 0) arena_slab->reserve(total, zero_init, slab_align); + auto data = make_outer_data(N_cells, arena_slab, + std::shared_ptr{}); + OuterTensor result(outer_range, batch_sz, std::move(data)); + + for (std::size_t ord = 0; ord < N_cells; ++ord) { + auto& r = ranges[ord]; + if (offsets[ord] == kNull) { + if constexpr (arena) { + ::new (result.data() + ord) InnerT(); + } else { + // Rank-0 empties must preserve Tensor's null-data/no-range invariant. + if (r.rank() == 0) + ::new (result.data() + ord) InnerT(); + else + ::new (result.data() + ord) InnerT(r); + } + } else if constexpr (arena) { + // slice(offset, 1) returns an aliased shared_ptr; we only + // need its raw pointer to placement-new the Cell -- the slab's lifetime + // is held by `arena_handle` captured in the outer's deleter. 
+ auto byte_view = arena_slab->template slice(offsets[ord], 1); + ::new (result.data() + ord) + InnerT(make_arena_tensor_in(byte_view.get(), std::move(r))); + } else { + auto elem_data = arena_slab->template slice(offsets[ord], r.volume()); + ::new (result.data() + ord) InnerT(r, std::move(elem_data)); + } + } + return result; +} + +/// Default (no-op) fill for `make_nested_tile` -- leaves element storage +/// zero-initialized. +struct nested_fill_noop { + template + void operator()(Cell&, const Index&) const noexcept {} +}; + +/// Build one ToT outer tile over `outer_range`, two-pass: +/// pass 1: `inner_range_fn(outer_element_index)` -> inner `range_type` +/// sizes every inner cell (zero-volume -> deliberately-null cell); +/// pass 2: `inner_fill_fn(inner_cell&, outer_element_index)` fills each +/// non-null cell. The default fill leaves storage zero-initialized. +/// Dispatches internally on the inner-tile type (see `arena_outer_init`). +template +OuterTensor make_nested_tile( + const typename OuterTensor::range_type& outer_range, + InnerRangeFn&& inner_range_fn, InnerFillFn&& inner_fill_fn = {}) { + // arena_outer_init keys ranges on the cell ordinal; user code keys on the + // (global) outer element index -- translate via the outer range. + auto cell_range_fn = [&](std::size_t ord) { + return inner_range_fn(outer_range.idx(ord)); + }; + OuterTensor result = + arena_outer_init(outer_range, 1, cell_range_fn); + const std::size_t N = outer_range.volume(); + for (std::size_t ord = 0; ord < N; ++ord) { + auto& cell = result.data()[ord]; + if (!cell.empty()) inner_fill_fn(cell, outer_range.idx(ord)); + } + return result; +} + +/// Apply a unary fill op while preserving each source inner range. +/// `fill_op(dst_data, src_data, n_elements)` writes the result cell. 
+template +OuterTensor arena_trivial_unary(const SrcOuterTensor& src, FillOp&& fill_op) { + using elem_t = typename OuterTensor::value_type::value_type; + using inner_range_t = typename OuterTensor::value_type::range_type; + // A null inner cell has no range to query (`ArenaTensor::range()` asserts + // non-null); map it to a default range -> a null result cell. + auto range_fn = [&src](std::size_t ord) -> inner_range_t { + const auto& s = src.data()[ord]; + return s.empty() ? inner_range_t{} : s.range(); + }; + // Elementwise kernels pack tight (no cross-cell GEMM to amortize padding); + // the fill overwrites every element, so the slab need not be zero-init'd. + OuterTensor result = arena_outer_init(src.range(), src.nbatch(), + range_fn, alignof(elem_t), + /*zero_init=*/false); + const std::size_t N_cells = src.range().volume() * src.nbatch(); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + auto& dst = result.data()[ord]; + if (dst.empty()) continue; + fill_op(dst.data(), src.data()[ord].data(), dst.size()); + } + return result; +} + +/// Apply a binary fill op using the left operand's inner ranges (asserted +/// equal to the right's per cell). `fill_op(dst, l, r, n_elements)`. +template +OuterTensor arena_trivial_binary(const LeftTensor& left, + const RightTensor& right, FillOp&& fill_op) { + using elem_t = typename OuterTensor::value_type::value_type; + using inner_range_t = typename OuterTensor::value_type::range_type; + TA_ASSERT(left.range().volume() == right.range().volume()); + TA_ASSERT(left.nbatch() == right.nbatch()); + auto range_fn = [&left](std::size_t ord) -> inner_range_t { + const auto& l = left.data()[ord]; + return l.empty() ? 
inner_range_t{} : l.range(); + }; + OuterTensor result = arena_outer_init( + left.range(), left.nbatch(), range_fn, alignof(elem_t), + /*zero_init=*/false); + const std::size_t N_cells = left.range().volume() * left.nbatch(); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + auto& dst = result.data()[ord]; + if (dst.empty()) continue; + TA_ASSERT(left.data()[ord].size() == right.data()[ord].size()); + TA_ASSERT(left.data()[ord].size() == dst.size()); + fill_op(dst.data(), left.data()[ord].data(), right.data()[ord].data(), + dst.size()); + } + return result; +} + +/// Trivial mixed scalar/ToT outer-Hadamard kernel: `tot_outer` drives the +/// result's outer and per-cell inner ranges; `scalar_outer` supplies one +/// scalar per outer cell. `fill_op(dst, tot_data, scalar_value, n_elements)`. +template +OuterTensor arena_trivial_scaled(const ToTSide& tot_outer, + const ScalarSide& scalar_outer, + FillOp&& fill_op) { + using elem_t = typename OuterTensor::value_type::value_type; + using inner_range_t = typename OuterTensor::value_type::range_type; + TA_ASSERT(tot_outer.range().volume() == scalar_outer.range().volume()); + TA_ASSERT(tot_outer.nbatch() == scalar_outer.nbatch()); + auto range_fn = [&tot_outer](std::size_t ord) -> inner_range_t { + const auto& t = tot_outer.data()[ord]; + return t.empty() ? inner_range_t{} : t.range(); + }; + OuterTensor result = arena_outer_init( + tot_outer.range(), tot_outer.nbatch(), range_fn, alignof(elem_t), + /*zero_init=*/false); + const std::size_t N_cells = tot_outer.range().volume() * tot_outer.nbatch(); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + auto& dst = result.data()[ord]; + if (dst.empty()) continue; + fill_op(dst.data(), tot_outer.data()[ord].data(), scalar_outer.data()[ord], + dst.size()); + } + return result; +} + +/// Grow `result` in place so every cell whose current inner cell is null but +/// `more_range_fn(cell_ordinal)` yields a non-empty range becomes an +/// allocated, zero-initialized cell. 
Data already accumulated in non-empty +/// cells is preserved -- a fresh slab is built and the old cell data copied +/// over. A no-op (no reallocation) when nothing grows, so the steady-state +/// path stays cheap. Used by the SUMMA ToT contraction, which shapes a result +/// tile from its first K-panel only and must extend it for later panels of a +/// contracted-dimension-sparse ToT operand. +template +void arena_tot_grow_inplace(OuterTensor& result, MoreRangeFn&& more_range_fn) { + using inner_t = typename OuterTensor::value_type; + using elem_t = typename inner_t::value_type; + using inner_range_t = typename inner_t::range_type; + const std::size_t N_cells = result.range().volume() * result.nbatch(); + std::vector ranges; + ranges.reserve(N_cells); + bool grows = false; + for (std::size_t ord = 0; ord < N_cells; ++ord) { + const auto& rc = result.data()[ord]; + if (!rc.empty()) { + ranges.emplace_back(rc.range()); + continue; + } + inner_range_t r = more_range_fn(ord); + if (r.volume() != 0) grows = true; + ranges.emplace_back(std::move(r)); + } + if (!grows) return; + OuterTensor grown = arena_outer_init( + result.range(), result.nbatch(), + [&ranges](std::size_t ord) -> inner_range_t { return ranges[ord]; }); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + const auto& src = result.data()[ord]; + if (src.empty()) continue; + auto& dst = grown.data()[ord]; + TA_ASSERT(!dst.empty() && dst.size() == src.size()); + const elem_t* s = src.data(); + elem_t* d = dst.data(); + for (std::size_t i = 0; i < src.size(); ++i) d[i] = s[i]; + } + result = std::move(grown); +} + +/// Accumulate `arg` into `result` (`result += arg`), first growing `result` +/// to the union of the two tiles' inner-cell sparsity. Either tile may be +/// outer-empty. Used to combine two partial contraction results whose +/// disjoint K-panel subsets induced different inner-cell sparsity. 
+template +void arena_tot_add_to(OuterTensor& result, const OuterTensor& arg) { + using inner_t = typename OuterTensor::value_type; + using elem_t = typename inner_t::value_type; + using inner_range_t = typename inner_t::range_type; + if (arg.empty()) return; + auto arg_range_fn = [&arg](std::size_t ord) -> inner_range_t { + const auto& a = arg.data()[ord]; + return a.empty() ? inner_range_t{} : a.range(); + }; + if (result.empty()) { + result = + arena_outer_init(arg.range(), arg.nbatch(), arg_range_fn); + } else { + TA_ASSERT(result.range().volume() == arg.range().volume()); + TA_ASSERT(result.nbatch() == arg.nbatch()); + arena_tot_grow_inplace(result, arg_range_fn); + } + const std::size_t N_cells = arg.range().volume() * arg.nbatch(); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + const auto& src = arg.data()[ord]; + if (src.empty()) continue; + auto& dst = result.data()[ord]; + TA_ASSERT(!dst.empty() && dst.size() == src.size()); + const elem_t* s = src.data(); + elem_t* d = dst.data(); + for (std::size_t i = 0; i < src.size(); ++i) d[i] += s[i]; + } +} + +/// Shallow-permute outer cells while preserving inner storage. The result +/// shares the source's inner storage (arena slab or aliased element data); +/// only the outer-cell array is rebuilt in permuted order. 
+template +OuterTensor arena_permute_shallow(const SrcOuterTensor& src, const Perm& perm) { + using inner_t = typename OuterTensor::value_type; + TA_ASSERT(perm); + TA_ASSERT(perm.size() == src.range().rank()); + auto perm_range = perm * src.range(); + const std::size_t N_cells = src.range().volume(); + const std::size_t total_cells = N_cells * src.nbatch(); + auto data = make_outer_data( + total_cells, std::make_shared(), src.data_shared()); + OuterTensor result(perm_range, src.nbatch(), std::move(data)); + for (std::size_t s = 0; s < N_cells; ++s) { + auto src_idx = src.range().idx(s); + auto tgt_ord = perm_range.ordinal(perm * src_idx); + for (std::size_t b = 0; b < src.nbatch(); ++b) { + const std::size_t s_off = b * N_cells + s; + const std::size_t t_off = b * N_cells + tgt_ord; + const inner_t& src_inner = src.data()[s_off]; + if constexpr (is_arena_tensor_v) { + // The view is 8 bytes; copy rebinds it to the same Cell. The source's + // arena is kept alive by the keep-alive captured in the deleter. + ::new (result.data() + t_off) inner_t(src_inner); + } else { + auto src_inner_data = const_cast(src_inner).data_shared(); + ::new (result.data() + t_off) inner_t( + src_inner.range(), src_inner.nbatch(), std::move(src_inner_data)); + } + } + } + return result; +} + +/// Permute the inner modes of every cell of a slab-backed ToT outer tile. +/// +/// Produces a fresh slab-backed tile with the same outer layout as `src`, +/// but with each inner cell's range and data permuted by `inner_perm` +/// (`result_cell(inner_perm * i) == src_cell(i)`). This is the slab-level +/// counterpart of a per-cell permute: the owning tile allocates one new +/// slab and rewrites every cell, so no view inner cell is ever asked to +/// value-return. `inner_perm` is a plain (non-bipartite) permutation whose +/// rank matches the inner-cell rank. 
+template +OuterTensor arena_inner_permute(const SrcOuterTensor& src, + const Perm& inner_perm) { + using inner_t = typename OuterTensor::value_type; + using elem_t = typename inner_t::value_type; + using inner_range_t = typename inner_t::range_type; + TA_ASSERT(inner_perm); + const std::size_t rank = inner_perm.size(); + + // result cell range = inner_perm applied to the src cell range; a null + // src cell maps to a default (null) range -> a null result cell. + auto range_fn = [&src, &inner_perm, rank](std::size_t ord) -> inner_range_t { + const auto& s = src.data()[ord]; + if (s.empty()) return inner_range_t{}; + TA_ASSERT(static_cast(s.range().rank()) == rank); + const auto& se = s.range().extent(); + std::vector ext(rank); + for (std::size_t d = 0; d < rank; ++d) + ext[d] = static_cast(se[d]); + return inner_range_t(inner_perm * ext); + }; + // The permute writes every result element exactly once, so no zero-init. + OuterTensor result = arena_outer_init(src.range(), src.nbatch(), + range_fn, alignof(elem_t), + /*zero_init=*/false); + + const std::size_t N_cells = src.range().volume() * src.nbatch(); + // Per-cell scratch (rank is fixed across cells); reused, not reallocated. + std::vector dstride(rank), w(rank), ctr(rank); + for (std::size_t ord = 0; ord < N_cells; ++ord) { + auto& dst = result.data()[ord]; + if (dst.empty()) continue; + const auto& s = src.data()[ord]; + const auto& se = s.range().extent(); + const auto& de = dst.range().extent(); + // row-major strides of the (permuted) destination cell + dstride[rank - 1] = 1; + for (std::size_t d = rank - 1; d > 0; --d) + dstride[d - 1] = dstride[d] * static_cast(de[d]); + // w[d] = destination stride contributed by source dimension d, since + // source dim d maps to destination dim inner_perm[d]. 
+ for (std::size_t d = 0; d < rank; ++d) + w[d] = dstride[static_cast(inner_perm[d])]; + // walk the source cell in row-major order, scattering into the dst cell + ctr.assign(rank, 0); + const std::size_t vol = s.size(); + const elem_t* sd = s.data(); + elem_t* dd = dst.data(); + for (std::size_t so = 0; so < vol; ++so) { + std::size_t dofs = 0; + for (std::size_t d = 0; d < rank; ++d) dofs += w[d] * ctr[d]; + dd[dofs] = sd[so]; + for (std::size_t d = rank; d-- > 0;) { + if (++ctr[d] < static_cast(se[d])) break; + ctr[d] = 0; + } + } + } + return result; +} + +} // namespace detail +} // namespace TiledArray + +#endif // TILEDARRAY_TENSOR_ARENA_KERNELS_H__INCLUDED diff --git a/src/TiledArray/tensor/arena_tensor.h b/src/TiledArray/tensor/arena_tensor.h new file mode 100644 index 0000000000..b4c3d4959e --- /dev/null +++ b/src/TiledArray/tensor/arena_tensor.h @@ -0,0 +1,535 @@ +/// ToT inner-tile type: pimpl-style pinned tensor backed by an arena cell. +/// +/// `ArenaTensor` is one pointer wide. Its referent is a `Cell` +/// (range header + co-located element storage, aligned for both) that the +/// outer tile's arena allocates and owns. The `ArenaTensor` itself is +/// non-owning; copies/moves rebind the pointer. Lifetime is bounded by the +/// outer tile that owns the arena slab. + +#ifndef TILEDARRAY_TENSOR_ARENA_TENSOR_H__INCLUDED +#define TILEDARRAY_TENSOR_ARENA_TENSOR_H__INCLUDED + +#include "TiledArray/error.h" +#include "TiledArray/math/blas.h" +#include "TiledArray/math/gemm_helper.h" +#include "TiledArray/tensor/type_traits.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace TiledArray { + +/// Alignment of in-arena element storage, in bytes. Sized to cover the +/// widest common SIMD register (AVX-512 ZMM = 64 B) and a single x86_64 +/// cache line. Override at configure time by defining +/// TILEDARRAY_INNER_SIMD_ALIGN to a larger power-of-two (e.g. 
128 for +/// two-cache-line floor / Apple-Silicon L1 line size). +#ifndef TILEDARRAY_INNER_SIMD_ALIGN +#define TILEDARRAY_INNER_SIMD_ALIGN 64 +#endif + +inline constexpr std::size_t kInnerSimdAlign = TILEDARRAY_INNER_SIMD_ALIGN; +static_assert((kInnerSimdAlign & (kInnerSimdAlign - 1)) == 0, + "kInnerSimdAlign must be a power of two"); + +template > +class ArenaTensor; + +// Forward decls of the free in-place CPOs (defined below). Needed so the +// member compound operators and member in-place CPOs on `ArenaTensor` can +// reference them. +template +void scale_to(ArenaTensor& dst, Scalar factor); +template +void add_to(ArenaTensor& dst, const ArenaTensor& src); +template +void subt_to(ArenaTensor& dst, const ArenaTensor& src); +template +void mult_to(ArenaTensor& dst, const ArenaTensor& src); +template +void axpy_to(ArenaTensor& dst, const ArenaTensor& src, + Scalar alpha); + +template +class ArenaTensor { + public: + using value_type = T; + using numeric_type = typename detail::numeric_type::type; + using scalar_type = typename detail::scalar_type::type; + using range_type = Range_; + using pointer = T*; + using const_pointer = const T*; + using reference = T&; + using const_reference = const T&; + using size_type = std::size_t; + + /// In-arena layout: range header, then padding, then element storage. + struct Cell { + range_type range; + }; + + /// Alignment of the element pointer past the cell header. Caller-owned + /// arena slots must honour this so SIMD loads/stores on `data()` are + /// aligned without an extra runtime check. + static constexpr size_type data_alignment() noexcept { + return alignof(T) > kInnerSimdAlign ? alignof(T) : kInnerSimdAlign; + } + + /// Offset (in bytes) of the first element past the cell header. + static constexpr size_type data_offset() noexcept { + constexpr size_type a = data_alignment(); + return (sizeof(Cell) + a - 1) & ~(a - 1); + } + + /// Total bytes a cell holding `n` elements consumes in the arena. 
+ static constexpr size_type cell_size(size_type n) noexcept { + return data_offset() + n * sizeof(T); + } + + /// Required alignment of a cell allocation. At least `data_alignment()` + /// so that `cell_base + data_offset()` lands on a SIMD boundary, and at + /// least `alignof(Cell)` so the range header is well-aligned. + static constexpr size_type cell_alignment() noexcept { + constexpr size_type da = data_alignment(); + return alignof(Cell) > da ? alignof(Cell) : da; + } + + ArenaTensor() = default; + ArenaTensor(const ArenaTensor&) = default; + /// Move construction transfers the view and leaves the source null. + ArenaTensor(ArenaTensor&& other) noexcept : cell_(other.cell_) { + other.cell_ = nullptr; + } + ~ArenaTensor() = default; + + /// Unified assignment, with two regimes keyed on whether `*this` is bound: + /// - bound (non-null) assignee: deep element-wise copy from `src` -- the + /// view's storage already exists, so assignment writes into it; + /// - null assignee: a shallow rebind of the view to `src`'s cell -- there + /// is no storage to deep-copy into. + /// This must be a user-provided non-template operator: the implicit + /// copy-assignment (a shallow pointer copy) would otherwise be generated + /// and, as a non-template exact match, would always shadow the templated + /// `operator=` below for `ArenaTensor` sources. There is deliberately no + /// move-assignment -- an rvalue `ArenaTensor` binds here and follows the + /// same two regimes (moving a view buys nothing over copying it). + ArenaTensor& operator=(const ArenaTensor& src) { + if (cell_ == nullptr) { + cell_ = src.cell_; // null assignee: rebind the view (shallow) + return *this; + } + return assign_elements_(src); // bound assignee: deep copy + } + + /// Construct a view onto a `Cell` (placement-newed by the arena factory). + explicit ArenaTensor(Cell* cell) noexcept : cell_(cell) {} + + /// True if the view points at a non-null cell. 
+ explicit operator bool() const noexcept { return cell_ != nullptr; } + + /// True if the view is null (no cell). + bool empty() const noexcept { return cell_ == nullptr; } + + /// Range of the referenced cell. UB if null. + const range_type& range() const noexcept { + TA_ASSERT(cell_ != nullptr); + return cell_->range; + } + + /// Pointer to the first element. Null when the view is null. + pointer data() noexcept { + if (cell_ == nullptr) return nullptr; + auto* base = reinterpret_cast(cell_); + return std::launder(reinterpret_cast(base + data_offset())); + } + + const_pointer data() const noexcept { + if (cell_ == nullptr) return nullptr; + auto* base = reinterpret_cast(cell_); + return std::launder(reinterpret_cast(base + data_offset())); + } + + /// Element count of the referenced cell, or 0 if null. + size_type size() const noexcept { + return cell_ != nullptr ? cell_->range.volume() : 0; + } + + reference operator[](size_type i) noexcept { + TA_ASSERT(cell_ != nullptr); + return data()[i]; + } + const_reference operator[](size_type i) const noexcept { + TA_ASSERT(cell_ != nullptr); + return data()[i]; + } + + /// Sum of all elements; `value_type{}` for a null view. A scalar + /// reduction allocates nothing, so it is valid on a view (unlike the + /// value-returning tensor ops, which are deliberately absent). + value_type sum() const noexcept { + value_type acc{}; + if (cell_ == nullptr) return acc; + const auto* s = data(); + for (size_type i = 0; i < size(); ++i) acc += s[i]; + return acc; + } + + /// Element-wise deep copy from a non-`ArenaTensor` tensor `src`. Valid only + /// for a bound (non-null) assignee: a null view has no storage to copy into + /// and a non-view `src` has no cell to rebind to (use the `ArenaTensor` + /// overload above for the rebind regime). 
+ template && + !std::is_same_v>> + ArenaTensor& operator=(const Src& src) { + TA_ASSERT(cell_ != nullptr && + "cannot assign a non-ArenaTensor source to a null ArenaTensor"); + return assign_elements_(src); + } + + /// In-place compound operators -- ArenaTensor is a view (no allocation), + /// so it provides only the *mutating* counterparts to the value-returning + /// `+`, `-`, `*` operators. Each delegates to the same-named free CPO + /// (forward-declared above, defined later in this header). The pair + /// (ArenaTensor x ArenaTensor) is the only one needed by TA's kernel + /// paths. Calls are fully-qualified to avoid recursing into the member + /// overloads of the same names below. + ArenaTensor& operator+=(const ArenaTensor& other) { + ::TiledArray::add_to(*this, other); + return *this; + } + ArenaTensor& operator-=(const ArenaTensor& other) { + ::TiledArray::subt_to(*this, other); + return *this; + } + ArenaTensor& operator*=(const ArenaTensor& other) { + ::TiledArray::mult_to(*this, other); + return *this; + } + // Scalar `*=` is intentionally not a member: the free `operator*=(T&&, N)` + // in operators_body.ipp already covers `view *= scalar`. A member template + // alongside it ties under gcc-13's overload resolution (ambiguous), so the + // free operator is the single provider of arena-cell `*= scalar`. + + /// Member-call mirrors of the free in-place CPOs. Tile-interface paths + /// (`add_to(result, arg)`, `subt_to`, etc.) and `Tensor`'s legacy + /// `inplace_binary` use these. Bodies fully-qualify the free CPO call so + /// the member doesn't recurse into itself. 
+ ArenaTensor& add_to(const ArenaTensor& other) { + ::TiledArray::add_to(*this, other); + return *this; + } + ArenaTensor& subt_to(const ArenaTensor& other) { + ::TiledArray::subt_to(*this, other); + return *this; + } + ArenaTensor& mult_to(const ArenaTensor& other) { + ::TiledArray::mult_to(*this, other); + return *this; + } + template + requires(detail::is_numeric_v) + ArenaTensor& scale_to(const Scalar factor) { + ::TiledArray::scale_to(*this, factor); + return *this; + } + ArenaTensor& neg_to() { + ::TiledArray::scale_to(*this, -T(1)); + return *this; + } + + /// axpy: *this += other * factor (axpy semantics; factor scales + /// only the added operand). Delegates to the free `axpy` CPO that the + /// outer-cell loop ultimately calls. Distinct from + /// `add_to(other, factor)` which would be the legacy + /// `(*this + other) * factor` semantics -- view tile types don't have + /// `operator+=` returning a value, so we keep the names separated. + template + requires(detail::is_numeric_v) + ArenaTensor& axpy_to(const ArenaTensor& other, const Scalar factor) { + ::TiledArray::axpy_to(*this, other, factor); + return *this; + } + + /// axpy + fused permutation. ArenaTensor is a fixed-layout view, so any + /// non-empty permutation is rejected at runtime. + template + requires(detail::is_numeric_v && detail::is_permutation_v) + ArenaTensor& axpy_to(const ArenaTensor& other, const Scalar factor, + const Perm& perm) { + TA_EXCEPTION( + "ArenaTensor::axpy_to(other, factor, perm): inner permutation is not " + "supported for view cells"); + return *this; + } + + /// Internal accessor for the cell pointer. Used by the arena factory and + /// by destruction walks; not part of the user-facing surface. + Cell* cell() const noexcept { return cell_; } + + private: + /// Deep element-wise copy into this bound view's storage from any tensor + /// `src` of matching volume (an `ArenaTensor` or an owning tensor alike). 
+ template + ArenaTensor& assign_elements_(const Src& src) { + TA_ASSERT(cell_ != nullptr); + TA_ASSERT(size() == static_cast(src.size())); + auto* dst = data(); + const auto* src_data = src.data(); + for (size_type i = 0; i < size(); ++i) dst[i] = src_data[i]; + return *this; + } + + Cell* cell_ = nullptr; +}; + +namespace detail { + +/// Placement-construct an `ArenaTensor` at the given pre-aligned, +/// pre-sized buffer. `buffer` must be at least +/// `ArenaTensor::cell_size(range.volume())` bytes and aligned to +/// `ArenaTensor::cell_alignment()`. Element storage is +/// value-initialized (zero for arithmetic `T`). +template +ArenaTensor make_arena_tensor_in(std::byte* buffer, R range) { + using Inner = ArenaTensor; + using Cell = typename Inner::Cell; + TA_ASSERT(buffer != nullptr); + TA_ASSERT( + reinterpret_cast(buffer) % Inner::cell_alignment() == 0); + const std::size_t n = range.volume(); + Cell* cell = ::new (static_cast(buffer)) Cell{std::move(range)}; + T* elems = reinterpret_cast(buffer + Inner::data_offset()); + if constexpr (std::is_trivially_constructible_v) { + std::memset(elems, 0, n * sizeof(T)); + } else { + for (std::size_t i = 0; i < n; ++i) + ::new (static_cast(elems + i)) T(); + } + return Inner(cell); +} + +/// Destruct in-place. Mirrors `make_arena_tensor_in`'s construction. Safe +/// on a null view (no-op). After this call the cell memory is uninitialized; +/// the arena slab still owns the bytes. +template +void destruct_arena_tensor(ArenaTensor& inner) noexcept { + auto* cell = inner.cell(); + if (cell == nullptr) return; + const std::size_t n = cell->range.volume(); + if constexpr (!std::is_trivially_destructible_v) { + T* elems = inner.data(); + for (std::size_t i = 0; i < n; ++i) elems[i].~T(); + } + if constexpr (!std::is_trivially_destructible_v) { + cell->~Cell(); + } +} + +} // namespace detail + +/// `is_tensor_view` is forward-declared in `tensor/type_traits.h` (primary +/// = `std::false_type`). 
Specializations for the concrete view types live +/// below; `external/btas.h` adds a spec for `btas::TensorView`. Distinct +/// from `is_tensor_helper`, which is also true for views (they are tensors +/// structurally) -- `is_tensor_view` is the *secondary* gate that opts views +/// out of value-returning member-call paths. + +/// True iff `T` is some `ArenaTensor` -- the arena-pinned view type. +/// Implies `is_tensor_view_v`. Use this trait only where arena slab +/// machinery is actually managed (e.g. clone, serialize, value-returning +/// add/subt/mult that allocate via `arena_trivial_*_pinned`); for the +/// "no value-returning ops on a view" gating use `is_tensor_view_v` instead. +template +struct is_arena_tensor : std::false_type {}; +template +struct is_arena_tensor> : std::true_type {}; +template +inline constexpr bool is_arena_tensor_v = is_arena_tensor::value; + +// Every ArenaTensor is a view. +template +struct is_tensor_view> : std::true_type {}; + +namespace detail { + +/// Register `ArenaTensor` as a tensor: it has the same `.data()` / `.size()` +/// flat-contiguous-storage shape as `TA::Tensor`. This makes +/// `is_tensor` true and `is_tensor_of_tensor>` +/// true via the existing recursion, so kernel-level dispatches +/// (`tensor_reduce`, `inplace_tensor_op`, `tensor_op`, ...) match the same +/// overloads they do for `TA::Tensor` without bespoke arena +/// overloads. To keep ArenaTensor out of value-returning member-call paths +/// (which require allocation that views can't do), `ta_ops_match_tensor` is +/// specialized below to false for `ArenaTensor`. +template +struct is_tensor_helper> : public std::true_type {}; + +/// ArenaTensor's element storage is contiguous and row-major. +template +struct is_contiguous_tensor_helper> : public std::true_type { +}; + +/// `ArenaTensor` counts as one nesting level, so `Tensor>` +/// out-ranks a plain `Tensor`. 
Without this, `nested_rank` +/// falls through to the primary `= 0` and `einsum`'s `MaxNestedArray` ties a +/// ToT arena array with a plain array, picking the wrong result tile type. +template +constexpr size_t nested_rank> = 1 + nested_rank; + +template +constexpr size_t nested_rank> = + nested_rank>; + +} // namespace detail + +// Note: `detail::TensorInterface` (a.k.a. `TA::TensorMap`) is non-owning, +// but it *does* provide value-returning member arithmetic (`.add()`, +// `.subt()`, ...) that materializes a fresh tensor. So it does NOT +// participate in `is_tensor_view` -- this trait is reserved for views that +// lack value-returning member arith (cannot allocate on their own), like +// `ArenaTensor` and `btas::TensorView`. + +} // namespace TiledArray + +// btas::TensorView is btas's existing non-owning view type. Register it as +// a view too. Forward-declared here (signature mirrors btas/tensorview.h) +// to avoid pulling that header into arena_tensor.h. +namespace btas { +template +class TensorView; +} // namespace btas + +namespace TiledArray { +template +struct is_tensor_view<::btas::TensorView> : std::true_type {}; + +/// Zero all elements of `dst`. No-op on a null view. +template +void zero(ArenaTensor& dst) noexcept { + if (!dst) return; + std::memset(dst.data(), 0, dst.size() * sizeof(T)); +} + +/// Fill `dst` with `value`. No-op on a null view. +template +void fill(ArenaTensor& dst, const U& value) { + if (!dst) return; + std::fill_n(dst.data(), dst.size(), static_cast(value)); +} + +/// `dst *= factor`. No-op on a null view. +template +void scale_to(ArenaTensor& dst, Scalar factor) { + if (!dst) return; + auto* d = dst.data(); + const auto n = dst.size(); + for (std::size_t i = 0; i < n; ++i) d[i] *= factor; +} + +/// `dst += src`. Asserts both views non-null and shape-compatible. 
+template +void add_to(ArenaTensor& dst, const ArenaTensor& src) { + if (!dst || !src) return; + TA_ASSERT(dst.size() == src.size()); + auto* d = dst.data(); + const auto* s = src.data(); + for (std::size_t i = 0; i < dst.size(); ++i) d[i] += s[i]; +} + +/// `dst -= src`. Asserts both views non-null and shape-compatible. +template +void subt_to(ArenaTensor& dst, const ArenaTensor& src) { + if (!dst || !src) return; + TA_ASSERT(dst.size() == src.size()); + auto* d = dst.data(); + const auto* s = src.data(); + for (std::size_t i = 0; i < dst.size(); ++i) d[i] -= s[i]; +} + +/// `dst *= src` element-wise. Asserts both views non-null and shape-compatible. +template +void mult_to(ArenaTensor& dst, const ArenaTensor& src) { + if (!dst || !src) return; + TA_ASSERT(dst.size() == src.size()); + auto* d = dst.data(); + const auto* s = src.data(); + for (std::size_t i = 0; i < dst.size(); ++i) d[i] *= s[i]; +} + +/// `dst += src * alpha` (in-place BLAS-like AXPY). Asserts both views +/// non-null and shape-compatible. Argument order matches TA's `_to` CPO +/// convention `(result, arg, factor)`; the BLAS name AXPY captures the +/// semantics (in-place, not value-producing). +template +void axpy_to(ArenaTensor& dst, const ArenaTensor& src, + Scalar alpha) { + if (!dst || !src) return; + TA_ASSERT(dst.size() == src.size()); + auto* d = dst.data(); + const auto* s = src.data(); + for (std::size_t i = 0; i < dst.size(); ++i) d[i] += alpha * s[i]; +} + +/// Sum of squared elements; 0 for null views. +template +auto squared_norm(const ArenaTensor& src) noexcept { + T acc{}; + if (!src) return acc; + const auto* s = src.data(); + for (std::size_t i = 0; i < src.size(); ++i) acc += s[i] * s[i]; + return acc; +} + +/// Copy `src` into a freshly-allocated `Standalone`. Returns a default- +/// constructed (null) `Standalone` when `src` is null. 
+template +Standalone materialize(const ArenaTensor& src) { + if (!src) return Standalone(); + Standalone out(src.range()); + std::copy_n(src.data(), src.size(), out.data()); + return out; +} + +/// GEMM CPO for `ArenaTensor`: accumulates `result += factor * left * right` +/// via BLAS. The result must be pre-allocated (e.g. zero-initialized by +/// `arena_outer_init`) -- this overload never resizes. More specific +/// than `tile_op/tile_interface.h`'s generic `gemm` template (which would +/// otherwise fall through to a nonexistent `result.gemm(...)` member), +/// so partial ordering picks it for `ArenaTensor` arguments. +template +auto gemm(ArenaTensor& result, const ArenaTensor& left, + const ArenaTensor& right, Scalar factor, + const math::GemmHelper& gemm_helper) + -> std::enable_if_t, ArenaTensor&> { + if (!left || !right) return result; + TA_ASSERT(bool(result)); + TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); + TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + + using integer = math::blas::integer; + integer M, N, K; + gemm_helper.compute_matrix_sizes(M, N, K, left.range(), right.range()); + + const integer lda = + (gemm_helper.left_op() == math::blas::NoTranspose) ? K : M; + const integer ldb = + (gemm_helper.right_op() == math::blas::NoTranspose) ? N : K; + const integer ldc = N; + + math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), M, N, K, + static_cast(factor), left.data(), lda, right.data(), ldb, + T(1), result.data(), ldc); + return result; +} + +} // namespace TiledArray + +#endif // TILEDARRAY_TENSOR_ARENA_TENSOR_H__INCLUDED diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index c68bad8f7a..3fdf86df75 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -1266,6 +1267,53 @@ auto tensor_contract(TensorA const& A, TensorB const& B, return plan.do_perm.C ? 
permute(result, plan.perm.C.inv()) : result; } +/// In-place contraction. Accumulates `factor * (A contracted with B per +/// plan)` into `result` with beta=1 -- `result` must be pre-allocated and +/// zero-initialized (or carry an existing partial sum to add into). +/// +/// Fast path: when `plan.do_perm.{A,B,C}` are all false (the canonical +/// alignment the expression engine produces), the contraction is exactly +/// one GEMM into `result` via the free `gemm` CPO. Works uniformly for +/// `TA::Tensor` and `ArenaTensor` inner cells. +/// +/// Slow path: when any operand requires permutation, the value-returning +/// `tensor_contract` is called and its result is accumulated into `result` +/// via free `add_to`. This requires materialization, which is incompatible +/// with `ArenaTensor`'s pinned-storage contract; for arena cells the +/// non-canonical case throws (the expression engine should pre-align). +template >> +ResultTensor& tensor_contract_to(ResultTensor& result, TensorA const& A, + TensorB const& B, Scalar factor, + const TensorContractionPlan& plan) { + if (!plan.do_perm.A && !plan.do_perm.B && !plan.do_perm.C) { + return gemm(result, A, B, factor, plan.gemm_helper); + } + constexpr bool any_arena = ::TiledArray::is_arena_tensor_v || + ::TiledArray::is_arena_tensor_v || + ::TiledArray::is_arena_tensor_v; + if constexpr (any_arena) { + TA_EXCEPTION( + "tensor_contract_to: non-canonical plan (do_perm.{A,B,C} not all " + "false) is unsupported for ArenaTensor cells; the expression " + "engine should pre-align inner modes to the canonical layout."); + return result; + } else { + // Value-semantic slow path. tensor_contract internally uses alpha=1; + // restrict callers here to factor=1 so the math matches. Regime-A + // always passes factor=1; lift this restriction only if a real caller + // needs a non-unit scale on the non-canonical path. 
+ using Numeric = typename ResultTensor::numeric_type; + TA_ASSERT(static_cast(factor) == Numeric{1} && + "tensor_contract_to: non-canonical plan currently supports " + "factor == 1 only"); + auto prod = tensor_contract(A, B, plan); + if (!prod.empty()) add_to(result, prod); + return result; + } +} + /// contracts 2 tensors, with 1 plan construction per call. /// Thus this is inefficient; plan should be constructed separately and then /// used to for multiple calls (see the variant of this function that diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index 05636c3d7d..24fd81c89f 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -50,7 +50,8 @@ namespace TiledArray { /// \return A tensor where element \c i is equal to tensor[i] + number template >>> + TA::detail::is_nested_tensor_v> && + !TA::is_tensor_view_v>>> inline decltype(auto) operator+( T1&& tensor, TA::detail::numeric_t> number) { return std::forward(tensor).add(number); @@ -65,7 +66,8 @@ inline decltype(auto) operator+( /// \return A tensor where element \c i is equal to tensor[i] + number template >>> + TA::detail::is_nested_tensor_v> && + !TA::is_tensor_view_v>>> inline decltype(auto) operator+( TA::detail::numeric_t> number, T1&& tensor) { return std::forward(tensor).add(number); @@ -80,7 +82,8 @@ inline decltype(auto) operator+( /// \return A tensor where element \c i is equal to tensor[i] - number template >>> + TA::detail::is_nested_tensor_v> && + !TA::is_tensor_view_v>>> inline decltype(auto) operator-( T1&& tensor, TA::detail::numeric_t> number) { return std::forward(tensor).subt(number); diff --git a/src/TiledArray/tensor/operators_body.ipp b/src/TiledArray/tensor/operators_body.ipp index 4e2d736a84..6b625b82b2 100644 --- a/src/TiledArray/tensor/operators_body.ipp +++ b/src/TiledArray/tensor/operators_body.ipp @@ -77,11 +77,15 @@ inline decltype(auto) operator*(N number, T&& tensor) { return scale(std::forward(tensor), 
number); } -/// tensor += tensor +/// tensor += tensor -- compound assignment is valid for any tensor whose +/// storage can be mutated, including views. Gated on the broader +/// `ta_ops_match_tensor_inplace_v` predicate. template > && - detail::ta_ops_match_tensor_v>>> + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t> && + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t>>> inline decltype(auto) operator+=(T1&& left, const T2& right) { return add_to(std::forward(left), right); } @@ -89,8 +93,10 @@ inline decltype(auto) operator+=(T1&& left, const T2& right) { /// tensor -= tensor template > && - detail::ta_ops_match_tensor_v>>> + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t> && + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t>>> inline decltype(auto) operator-=(T1&& left, const T2& right) { return subt_to(std::forward(left), right); } @@ -98,8 +104,10 @@ inline decltype(auto) operator-=(T1&& left, const T2& right) { /// tensor *= tensor (element-wise) template > && - detail::ta_ops_match_tensor_v>>> + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t> && + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t>>> inline decltype(auto) operator*=(T1&& left, const T2& right) { return mult_to(std::forward(left), right); } @@ -107,7 +115,8 @@ inline decltype(auto) operator*=(T1&& left, const T2& right) { /// tensor *= scalar template > && + detail::ta_ops_match_tensor_inplace_v< + TA::detail::remove_cvr_t> && TA::detail::is_numeric_v>> inline decltype(auto) operator*=(T&& left, N right) { return scale_to(std::forward(left), right); diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index fa04ff7eda..ca67641e83 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -27,6 +27,7 @@ #include "TiledArray/math/blas.h" #include "TiledArray/math/gemm_helper.h" +#include "TiledArray/tensor/arena_kernels.h" #include 
"TiledArray/tensor/complex.h" #include "TiledArray/tensor/kernels.h" #include "TiledArray/tile_interface/clone.h" @@ -266,7 +267,8 @@ class Tensor { template static decltype(auto) value_converter(const T_& arg) { using arg_type = detail::remove_cvr_t; - if constexpr (detail::is_tensor_v) // clone nested tensors + if constexpr (detail::is_tensor_v && + !is_tensor_view_v) // clone owning nested tensors return arg.clone(); else if constexpr (!std::is_same_v) { // convert if constexpr (std::is_convertible_v) @@ -274,7 +276,7 @@ class Tensor { else return conversions::to()(arg); } else - return arg; + return arg; // identity (for views, copy = rebind, no deep clone) }; range_type range_; ///< Range @@ -369,9 +371,14 @@ class Tensor { : Tensor(range, 1, default_construct{false}) { const auto n = this->size(); pointer MADNESS_RESTRICT const data = this->data(); - Clone cloner; - for (size_type i = 0ul; i < n; ++i) - new (data + i) value_type(cloner(value)); + if constexpr (is_tensor_view_v) { + // Views are rebind-on-copy and lack member `clone`; just copy each. + for (size_type i = 0ul; i < n; ++i) new (data + i) value_type(value); + } else { + Clone cloner; + for (size_type i = 0ul; i < n; ++i) + new (data + i) value_type(cloner(value)); + } } /// Construct a tensor of scalars, setting all elements to the same value @@ -481,8 +488,13 @@ class Tensor { // we do that now constexpr bool is_tot = detail::is_tensor_of_tensor_v; constexpr bool is_bperm = detail::is_bipartite_permutation_v; - // tile ops pass bipartite permutations here even if this is a plain tensor - if constexpr (is_tot && is_bperm) { + constexpr bool is_view = is_tensor_view_v; + // tile ops pass bipartite permutations here even if this is a plain tensor. 
+ // For view inners, the cell has fixed layout that can't be permuted in + // place -- skip the inner-permute pass and rely on callers to arrange + // canonical inner indexing (regime-A einsum's `do_perm.{A,B,C}` bailout + // guarantees no inner permutation is needed for our paths). + if constexpr (is_tot && is_bperm && !is_view) { if (inner_size(perm) != 0) { const auto inner_perm = inner(perm); Permute p; @@ -493,6 +505,12 @@ class Tensor { if (!el.empty()) el = p(el, inner_perm); } } + } else if constexpr (is_tot && is_bperm && is_view) { + if (inner_size(perm) != 0) { + TA_EXCEPTION( + "Tensor: inner permutation requested but view " + "cells cannot be permuted in place"); + } } } @@ -652,8 +670,21 @@ class Tensor { Tensor clone() const& { Tensor result; if (data_) { - if constexpr (detail::is_tensor_of_tensor_v) { - result = Tensor(*this, [](value_type const& el) { return el.clone(); }); + if constexpr (detail::is_tensor_of_tensor_v && + detail::is_ta_tensor_v) { + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* src, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i]; + }; + result = detail::arena_trivial_unary(*this, fill); + } else if constexpr (is_arena_tensor_v) { + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* src, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i]; + }; + result = detail::arena_trivial_unary(*this, fill); } else { result = detail::tensor_op( [](const numeric_type value) -> numeric_type { return value; }, @@ -1190,11 +1221,20 @@ class Tensor { if (!empty) { ar & range; ar & nbatch; - if constexpr (madness::is_input_archive_v) { - *this = Tensor(std::move(range), nbatch, default_construct{true}); + if constexpr (is_arena_tensor_v) { + // ArenaTensor inner cells own no storage themselves; their data + // lives in a per-outer-tile arena slab. 
Bypass the generic + // wrap(value_type*, N) path (which would try to serialize bare + // Cell* pointers across processes) and manage cell storage at + // this outer-tile boundary instead. The slab is rebuilt on load. + serialize_arena_inner_cells(ar, std::move(range), nbatch); + } else { + if constexpr (madness::is_input_archive_v) { + *this = Tensor(std::move(range), nbatch, default_construct{true}); + } + ar& madness::archive::wrap(this->data_.get(), + this->range_.volume() * nbatch); } - ar& madness::archive::wrap(this->data_.get(), - this->range_.volume() * nbatch); } else { if constexpr (madness::is_input_archive_v) { *this = Tensor{}; @@ -1202,6 +1242,60 @@ class Tensor { } } + private: + /// ArenaTensor-aware inner-cell serialization. Writes per-cell metadata + /// (null flag + range) then element bytes; on load, rebuilds the outer + /// via `arena_outer_init` so the slab is reconstructed in one + /// allocation and the outer-data deleter keeps it alive. + template + void serialize_arena_inner_cells(Archive& ar, range_type range, + std::size_t nbatch) { + using InnerT = value_type; + using InnerRange = typename InnerT::range_type; + const std::size_t N = range.volume() * nbatch; + if constexpr (madness::is_output_archive_v) { + // Per-cell null flags. + for (std::size_t i = 0; i < N; ++i) { + bool not_null = bool(this->data_.get()[i]); + ar & not_null; + } + // Inner ranges for non-null cells only. + for (std::size_t i = 0; i < N; ++i) { + const InnerT& cell = this->data_.get()[i]; + if (cell) ar & cell.range(); + } + // Element bytes for non-null cells only. + for (std::size_t i = 0; i < N; ++i) { + const InnerT& cell = this->data_.get()[i]; + if (cell) ar& madness::archive::wrap(cell.data(), cell.size()); + } + } else { + // Load: read all metadata, plan + allocate slab via the factory, + // then read element bytes into each placed cell's data(). 
+ std::vector flags(N); + for (std::size_t i = 0; i < N; ++i) { + bool f; + ar & f; + flags[i] = f; + } + std::vector ranges(N); + for (std::size_t i = 0; i < N; ++i) { + if (flags[i]) ar& ranges[i]; + } + *this = detail::arena_outer_init( + range, nbatch, [&](std::size_t ord) -> InnerRange { + return flags[ord] ? ranges[ord] : InnerRange{}; + }); + for (std::size_t i = 0; i < N; ++i) { + if (flags[i]) { + InnerT& cell = this->data_.get()[i]; + ar& madness::archive::wrap(cell.data(), cell.size()); + } + } + } + } + + public: /// Swap tensor data /// \param other The tensor to swap with this @@ -1441,7 +1535,27 @@ class Tensor { template >> Tensor permute(const Perm& perm) const { - return Tensor(*this, perm); + if constexpr (is_arena_tensor_v) { + // View inner cells cannot be permuted in place; the owning tile + // rewrites its slab(s). The outer cells reorder shallowly (the 8-byte + // views are reindexed, the slab is shared via keep-alive); a + // non-trivial inner permutation rewrites every cell into a fresh slab. + // The generic Tensor(other, perm) ctor's allocate-then-fill shape does + // not fit the arena slab model, so route around it. + const auto outer_perm = outer(perm); + Tensor result = + (outer_perm && !outer_perm.is_identity()) + ? 
detail::arena_permute_shallow(*this, outer_perm) + : *this; + if constexpr (detail::is_bipartite_permutation_v) { + const auto inner_perm = inner(perm); + if (inner_perm && !inner_perm.is_identity()) + result = detail::arena_inner_permute(result, inner_perm); + } + return result; + } else { + return Tensor(*this, perm); + } } /// Shift the lower and upper bound of this tensor @@ -1680,10 +1794,27 @@ class Tensor { // early exit for empty this if (empty()) return {}; - return unary([factor](const value_type& a) { - using namespace TiledArray::detail; - return a * factor; - }); + if constexpr (detail::is_tensor_of_tensor_v && + detail::is_ta_tensor_v) { + auto fill = [factor](typename value_type::value_type* dst, + const typename value_type::value_type* src, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i] * factor; + }; + return detail::arena_trivial_unary(*this, fill); + } else if constexpr (is_arena_tensor_v) { + auto fill = [factor](typename value_type::value_type* dst, + const typename value_type::value_type* src, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i] * factor; + }; + return detail::arena_trivial_unary(*this, fill); + } else { + return unary([factor](const value_type& a) { + using namespace TiledArray::detail; + return a * factor; + }); + } } /// Construct a scaled copy of this tensor @@ -1714,12 +1845,19 @@ class Tensor { // early exit for empty this if (empty()) return {}; - return unary( - [factor](const value_type& a) { - using namespace TiledArray::detail; - return a * factor; - }, - perm); + if constexpr (is_tensor_view_v) { + TA_EXCEPTION( + "Tensor::scale(factor, perm): permutation is not " + "supported for view inner cells"); + return Tensor{}; + } else { + return unary( + [factor](const value_type& a) { + using namespace TiledArray::detail; + return a * factor; + }, + perm); + } } /// Scale this tensor @@ -1739,6 +1877,111 @@ class Tensor { // Addition operations + /// Element-wise add for 
`Tensor` ToT operands. Routes through + /// the arena binary kernel; inner cells have no `operator+` of their own. + template + requires(is_arena_tensor_v && + is_arena_tensor_v) + Tensor add(const Right& right) const { + if (empty()) return detail::clone_or_cast(right); + if (right.empty()) return this->clone(); + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* l, + const typename value_type::value_type* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] + r[i]; + }; + return detail::arena_trivial_binary(*this, right, fill); + } + + /// Mixed `Tensor + Tensor`: each inner element is + /// offset by the corresponding outer-cell scalar. Routes through the + /// arena scaled kernel; no operator+ between ArenaTensor and scalar. + template + requires(is_arena_tensor_v && + detail::is_numeric_v) + Tensor add(const Right& right) const { + if (empty() || right.empty()) return {}; + using ElemT = typename value_type::value_type; + using Scalar = typename Right::value_type; + auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = arena[i] + s; + }; + return detail::arena_trivial_scaled(*this, right, fill); + } + + /// Mixed `Tensor + Tensor`: symmetric to above, + /// result has the same ToT layout as the right operand. + template + requires(detail::is_numeric_v && + is_arena_tensor_v) + Right add(const Right& right) const { + if (empty() || right.empty()) return {}; + using ArenaInner = typename Right::value_type; + using ElemT = typename ArenaInner::value_type; + using Scalar = value_type; + auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = s + arena[i]; + }; + return detail::arena_trivial_scaled(right, *this, fill); + } + + /// Scaled element-wise add for `Tensor` ToT operands: + /// `(this + right) * factor`. Routes through the arena binary kernel. 
+ template + requires(is_arena_tensor_v && + is_arena_tensor_v && + detail::is_numeric_v) + Tensor add(const Right& right, const Scalar factor) const { + using ElemT = typename value_type::value_type; + auto fill = [factor](ElemT* dst, const ElemT* l, const ElemT* r, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = (l[i] + r[i]) * factor; + }; + return detail::arena_trivial_binary(*this, right, fill); + } + + /// True if \p perm reorders nothing -- empty or identity. Handles a plain + /// Permutation and a (bipartite) ToT permutation alike. + template + static bool arena_perm_is_trivial(const Perm& perm) { + if constexpr (std::is_same_v) + return !static_cast(perm) || + (perm.first().is_identity() && perm.second().is_identity()); + else + return !static_cast(perm) || perm.is_identity(); + } + + /// Permuted add for `Tensor` ToT operands. A non-trivial + /// permutation of arena ToT tiles is not yet supported; an identity (or + /// null) permutation falls through to the plain element-wise add. + template + requires(is_arena_tensor_v && + is_arena_tensor_v && + detail::is_permutation_v) + Tensor add(const Right& right, const Perm& perm) const { + if (!arena_perm_is_trivial(perm)) + TA_EXCEPTION( + "TA::Tensor::add: permuted add of a tensor-of-tensors " + "is not yet supported"); + return add(right); + } + + /// Permuted scaled add for `Tensor` ToT operands; see the + /// permuted-add overload above for the permutation restriction. 
+ template + requires(is_arena_tensor_v && + is_arena_tensor_v && + detail::is_numeric_v && detail::is_permutation_v) + Tensor add(const Right& right, const Scalar factor, const Perm& perm) const { + if (!arena_perm_is_trivial(perm)) + TA_EXCEPTION( + "TA::Tensor::add: permuted scaled add of a " + "tensor-of-tensors is not yet supported"); + return add(right, factor); + } + /// Add this and \c other to construct a new tensor /// \tparam Right The right-hand tensor type @@ -1748,7 +1991,11 @@ class Tensor { template requires(is_tensor::value && detail::sum_convertible_to&>) + const value_t&> && + !(is_arena_tensor_v && + detail::is_numeric_v) && + !(detail::is_numeric_v && + is_arena_tensor_v)) Tensor add(const Right& right) const { // early exit for empty right if (right.empty()) return this->clone(); @@ -1756,24 +2003,35 @@ class Tensor { - // early exit for empty this - if (empty()) detail::clone_or_cast(right); + // early exit for empty this: the sum is just (a copy of) right + if (empty()) return detail::clone_or_cast(right); - return binary( - right, - [](const value_type& l, const value_t& r) -> decltype(l + r) { - if constexpr (detail::is_tensor_v) { - if (l.empty()) { - if (r.empty()) - return {}; - else - return r.clone(); - } else { - if (r.empty()) - return l.clone(); - else - return l + r; + if constexpr (detail::is_tensor_of_tensor_v && + detail::is_ta_tensor_v && + detail::is_ta_tensor_v) { + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* l, + const typename value_type::value_type* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] + r[i]; + }; + return detail::arena_trivial_binary(*this, right, fill); + } else { + return binary( + right, + [](const value_type& l, const value_t& r) -> decltype(l + r) { + if constexpr (detail::is_tensor_v) { + if (l.empty()) { + if (r.empty()) + return {}; + else + return r.clone(); + } else { + if (r.empty()) + return l.clone(); + else + return l + r; + } } - } - return l + r; - 
}); + return l + r; + }); + } } /// Add this and \c other to construct a new tensor @@
-1800,7 +2058,13 @@ class Tensor { template requires(detail::is_tensor_v && !detail::sum_convertible_to&>) + const value_t&> && + !(is_arena_tensor_v && + is_arena_tensor_v) && + !(is_arena_tensor_v && + detail::is_numeric_v) && + !(detail::is_numeric_v && + is_arena_tensor_v)) auto add(const Right& right) const { return binary(right, [](const value_type& l, const value_t& r) { return l + r; @@ -1932,6 +2196,74 @@ class Tensor { const value_t r) { (l += r) *= factor; }); } + /// axpy: result[i] += arg[i] * factor (factor scales only the + /// added operand, not the existing result). Distinct from + /// `add_to(arg, factor)` which has the legacy `(result + arg) * factor` + /// semantics. Useful as a fused replacement for + /// `add_to(result, scale(arg, factor))` when the intermediate + /// materialization is undesirable (e.g. when `value_type` is a view). + /// + /// The lambda body dispatches by element type so the same body works + /// for flat and ToT tensors -- at the leaf (scalar) level it uses + /// `l += r * factor`; at the cell level it delegates to the cell's + /// `axpy_to` member function (called as `l.axpy_to(r, factor)`). + template + requires(is_tensor::value && detail::is_numeric_v) + Tensor& axpy_to(const Right& right, const Scalar factor) { + if (right.empty()) return *this; + if (empty()) { + *this = detail::clone_or_cast(right); + this->scale_to(factor); + return *this; + } + return inplace_binary(right, + [factor](auto& MADNESS_RESTRICT l, const auto& r) { + using L = std::remove_reference_t; + if constexpr (detail::is_tensor_helper::value) { + l.axpy_to(r, factor); + } else { + l += r * factor; + } + }); + } + + /// axpy with fused permutation on the added operand: + /// result[i] += (perm ^ arg)[i] * factor. + /// + /// Bails for view inner cells (which cannot be permuted in place).
+ template + requires(is_tensor::value && detail::is_numeric_v && + detail::is_permutation_v) + Tensor& axpy_to(const Right& right, const Scalar factor, const Perm& perm) { + if (right.empty()) return *this; + if constexpr (is_tensor_view_v) { + TA_EXCEPTION( + "Tensor::axpy_to(right, factor, perm): inner " + "permutation is not supported for view inner cells"); + return *this; + } else { + auto permuted = right.permute(perm); + if (empty()) { + // first contribution into an unallocated target (e.g. a contraction + // result inner cell): initialize to factor * (perm ^ arg) rather + // than asserting non-empty in inplace_binary -- mirrors the + // non-permuting axpy_to overload above. + *this = detail::clone_or_cast(permuted); + this->scale_to(factor); + return *this; + } + return inplace_binary( + permuted, [factor](auto& MADNESS_RESTRICT l, const auto& r) { + using L = std::remove_reference_t; + if constexpr (detail::is_tensor_helper::value) { + l.axpy_to(r, factor); + } else { + l += r * factor; + } + }); + } + } + /// Add a constant to this tensor /// \param value The constant to be added @@ -1948,33 +2280,93 @@ class Tensor { /// Subtract \c right from this and return the result + /// Element-wise subtraction for `Tensor` ToT operands. Routes + /// through the arena binary kernel; inner cells have no `operator-`. + template + requires(is_arena_tensor_v && + is_arena_tensor_v) + Tensor subt(const Right& right) const { + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* l, + const typename value_type::value_type* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] - r[i]; + }; + return detail::arena_trivial_binary(*this, right, fill); + } + + /// Mixed `Tensor - Tensor`: subtract per-cell scalar + /// from every inner element. Routes through the arena scaled kernel. 
+ template + requires(is_arena_tensor_v && + detail::is_numeric_v) + Tensor subt(const Right& right) const { + if (empty() || right.empty()) return {}; + using ElemT = typename value_type::value_type; + using Scalar = typename Right::value_type; + auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = arena[i] - s; + }; + return detail::arena_trivial_scaled(*this, right, fill); + } + + /// Mixed `Tensor - Tensor`: for each outer cell, + /// broadcast the scalar minus each inner element of the arena side. + template + requires(detail::is_numeric_v && + is_arena_tensor_v) + Right subt(const Right& right) const { + if (empty() || right.empty()) return {}; + using ArenaInner = typename Right::value_type; + using ElemT = typename ArenaInner::value_type; + using Scalar = value_type; + auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = s - arena[i]; + }; + return detail::arena_trivial_scaled(right, *this, fill); + } + /// \tparam Right The right-hand tensor type /// \param right The tensor that will be subtracted from this tensor /// \return A new tensor where the elements are the difference between the /// elements of \c this and \c right template >> + detail::tensors_have_equal_nested_rank_v && + !(is_arena_tensor_v && + is_arena_tensor_v)>> Tensor subt(const Right& right) const { - return binary( - right, - [](const value_type& l, const value_t& r) -> decltype(l - r) { - if constexpr (detail::is_tensor_v) { - if (l.empty()) { - if (r.empty()) - return {}; - else - return -r; + if constexpr (detail::is_tensor_of_tensor_v && + detail::is_ta_tensor_v && + detail::is_ta_tensor_v) { + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* l, + const typename value_type::value_type* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] - r[i]; + }; + return
detail::arena_trivial_binary(*this, right, fill); + } else { + return binary( + right, + [](const value_type& l, const value_t& r) -> decltype(l - r) { + if constexpr (detail::is_tensor_v) { + if (l.empty()) { + if (r.empty()) + return {}; + else + return -r; + } else { + if (r.empty()) + return l.clone(); + else + return l - r; + } } else { - if (r.empty()) - return l.clone(); - else - return l - r; + return l - r; } - } else { - return l - r; - } - }); + }); + } } /// Subtract \c right from this and return the result permuted by \c perm @@ -1990,9 +2382,18 @@ class Tensor { typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> Tensor subt(const Right& right, const Perm& perm) const { - return binary( - right, [](const value_type& l, const value_type& r) { return l - r; }, - perm); + if constexpr (is_tensor_view_v) { + // Permutation isn't supported for view inner cells (fixed storage + // layout). Subt+permute would require materialization. + TA_EXCEPTION( + "Tensor::subt(right, perm): permutation is not " + "supported for view inner cells"); + return Tensor{}; + } else { + return binary( + right, [](const value_type& l, const value_type& r) { return l - r; }, + perm); + } } /// Subtract \c right from this and return the result scaled by a scaling \c @@ -2009,9 +2410,19 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor subt(const Right& right, const Scalar factor) const { - return binary(right, [factor](const value_type& l, const value_type& r) { - return (l - r) * factor; - }); + if constexpr (is_arena_tensor_v && + is_arena_tensor_v) { + using ElemT = typename value_type::value_type; + auto fill = [factor](ElemT* dst, const ElemT* l, const ElemT* r, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = (l[i] - r[i]) * factor; + }; + return detail::arena_trivial_binary(*this, right, fill); + } else { + return binary(right, [factor](const value_type& l, const value_type& r) { + 
return (l - r) * factor; + }); + } } /// Subtract \c right from this and return the result scaled by a scaling \c @@ -2030,12 +2441,21 @@ class Tensor { is_tensor::value && detail::is_numeric_v && detail::is_permutation_v>::type* = nullptr> Tensor subt(const Right& right, const Scalar factor, const Perm& perm) const { - return binary( - right, - [factor](const value_type& l, const value_type& r) { - return (l - r) * factor; - }, - perm); + if constexpr (is_arena_tensor_v && + is_arena_tensor_v) { + if (!arena_perm_is_trivial(perm)) + TA_EXCEPTION( + "TA::Tensor::subt: permuted scaled subt of a " + "tensor-of-tensors is not yet supported"); + return subt(right, factor); + } else { + return binary( + right, + [factor](const value_type& l, const value_type& r) { + return (l - r) * factor; + }, + perm); + } } /// Subtract a constant from a copy of this tensor @@ -2108,9 +2528,60 @@ class Tensor { /// \param right The tensor that will be multiplied by this tensor /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right - template >::type* = - nullptr> + /// Element-wise mult for `Tensor` ToT operands. Routes + /// through the arena binary kernel; inner cells have no `operator*`. + template + requires(is_arena_tensor_v && + is_arena_tensor_v) + Tensor mult(const Right& right) const { + if (empty() || right.empty()) return {}; + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* l, + const typename value_type::value_type* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] * r[i]; + }; + return detail::arena_trivial_binary(*this, right, fill); + } + + /// Mixed `Tensor * Tensor`: outer Hadamard, each + /// inner cell scaled by the corresponding scalar. Routes through the + /// arena scaled kernel; no operator* between ArenaTensor and scalar. 
+ template + requires(is_arena_tensor_v && + detail::is_numeric_v) + Tensor mult(const Right& right) const { + if (empty() || right.empty()) return {}; + using ElemT = typename value_type::value_type; + using Scalar = typename Right::value_type; + auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = arena[i] * s; + }; + return detail::arena_trivial_scaled(*this, right, fill); + } + + /// Mixed `Tensor * Tensor`: symmetric to above, + /// result has the same ToT layout as the right operand. + template + requires(detail::is_numeric_v && + is_arena_tensor_v) + Right mult(const Right& right) const { + if (empty() || right.empty()) return {}; + using ArenaInner = typename Right::value_type; + using ElemT = typename ArenaInner::value_type; + using Scalar = value_type; + auto fill = [](ElemT* dst, const ElemT* arena, const Scalar& s, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = s * arena[i]; + }; + return detail::arena_trivial_scaled(right, *this, fill); + } + + template < + typename Right, + typename std::enable_if< + detail::is_nested_tensor_v && !is_arena_tensor_v && + !is_arena_tensor_v>::type* = nullptr> decltype(auto) mult(const Right& right) const { auto mult_op = [](const value_type& l, const value_t& r) { return l * r; @@ -2122,7 +2593,18 @@ class Tensor { return res_t{}; } - return binary(right, mult_op); + if constexpr (detail::is_tensor_of_tensor_v && + detail::is_ta_tensor_v && + detail::is_ta_tensor_v) { + auto fill = [](typename value_type::value_type* dst, + const typename value_type::value_type* l, + const typename value_type::value_type* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] * r[i]; + }; + return detail::arena_trivial_binary(*this, right, fill); + } else { + return binary(right, mult_op); + } } /// Multiply this by \c right to create a new, permuted tensor @@ -2138,10 +2620,33 @@ class Tensor { typename std::enable_if && 
detail::is_permutation_v>::type* = nullptr> decltype(auto) mult(const Right& right, const Perm& perm) const { - return binary( - right, - [](const value_type& l, const value_t& r) { return l * r; }, - perm); + if constexpr (is_arena_tensor_v && + is_arena_tensor_v) { + if (!arena_perm_is_trivial(perm)) + TA_EXCEPTION( + "TA::Tensor::mult: permuted mult of a " + "tensor-of-tensors is not yet supported"); + return mult(right); + } else if constexpr (detail::is_numeric_v && + is_arena_tensor_v) { + // t x tot: a plain scalar tile times an arena ToT tile. The 2-arg + // arena overload scales each inner cell into a fresh slab; a + // non-trivial result permutation is then a shallow outer reindex of + // that slab (the inner part is identity for a Hadamard t x tot). + auto result = mult(right); + return arena_perm_is_trivial(perm) ? result : result.permute(perm); + } else if constexpr (is_arena_tensor_v && + detail::is_numeric_v) { + // tot x t: the mirror of the above -- an arena ToT tile times a plain + // scalar tile. Same slab-then-reindex handling. + auto result = mult(right); + return arena_perm_is_trivial(perm) ? 
result : result.permute(perm); + } else { + return binary( + right, + [](const value_type& l, const value_t& r) { return l * r; }, + perm); + } } /// Scale and multiply this by \c right to create a new tensor @@ -2157,10 +2662,20 @@ class Tensor { typename std::enable_if && detail::is_numeric_v>::type* = nullptr> decltype(auto) mult(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const value_type& l, const value_t& r) { - return (l * r) * factor; - }); + if constexpr (is_arena_tensor_v && + is_arena_tensor_v) { + using ElemT = typename value_type::value_type; + auto fill = [factor](ElemT* dst, const ElemT* l, const ElemT* r, + std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = (l[i] * r[i]) * factor; + }; + return detail::arena_trivial_binary(*this, right, fill); + } else { + return binary(right, + [factor](const value_type& l, const value_t& r) { + return (l * r) * factor; + }); + } } /// Scale and multiply this by \c right to create a new, permuted tensor @@ -2180,12 +2695,21 @@ class Tensor { detail::is_permutation_v>::type* = nullptr> decltype(auto) mult(const Right& right, const Scalar factor, const Perm& perm) const { - return binary( - right, - [factor](const value_type& l, const value_t& r) { - return (l * r) * factor; - }, - perm); + if constexpr (is_arena_tensor_v && + is_arena_tensor_v) { + if (!arena_perm_is_trivial(perm)) + TA_EXCEPTION( + "TA::Tensor::mult: permuted scaled mult of a " + "tensor-of-tensors is not yet supported"); + return mult(right, factor); + } else { + return binary( + right, + [factor](const value_type& l, const value_t& r) { + return (l * r) * factor; + }, + perm); + } } /// Multiply this tensor by \c right @@ -2239,7 +2763,13 @@ class Tensor { // early exit for empty this if (empty()) return this->clone(); - return unary([](const value_type r) { return -r; }); + if constexpr (is_arena_tensor_v) { + Tensor result = this->clone(); + result.scale_to(numeric_type(-1)); + return result; 
+ } else { + return unary([](const value_type r) { return -r; }); + } } /// Create a negated and permuted copy of this tensor @@ -2253,7 +2783,16 @@ class Tensor { // early exit for empty this if (empty()) return this->clone(); - return unary([](const value_type l) { return -l; }, perm); + if constexpr (is_tensor_view_v) { + // View cells cannot be permuted in place (size-fixed); permute is + // intentionally not supported here. + TA_EXCEPTION( + "Tensor::neg(perm): permutation is not supported " + "for view inner cells"); + return Tensor{}; + } else { + return unary([](const value_type l) { return -l; }, perm); + } } /// Negate elements of this tensor @@ -2263,7 +2802,11 @@ class Tensor { // early exit for empty this if (empty()) return *this; - return inplace_unary([](value_type& MADNESS_RESTRICT l) { l = -l; }); + if constexpr (is_tensor_view_v) { + return this->scale_to(numeric_type(-1)); + } else { + return inplace_unary([](value_type& MADNESS_RESTRICT l) { l = -l; }); + } } /// Create a complex conjugated copy of this tensor @@ -2799,6 +3342,19 @@ class Tensor { }; // class Tensor +/// \return the number of bytes an `ArenaTensor` view plus its in-arena cell +/// occupy in memory space `S`. `size_of(Tensor)` recurses here +/// once per inner cell; summed over the outer tile this counts the slab. 
+template +std::size_t size_of(const ArenaTensor& t) { + std::size_t result = 0; + if constexpr (S == MemorySpace::Host) { + result += sizeof(t); // the one-pointer view itself + if (!t.empty()) result += ArenaTensor::cell_size(t.size()); + } + return result; +} + /// \return the number of bytes used by \p t in memory space /// `S` template diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index ebc04ebe23..1a457e61b1 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -117,18 +117,51 @@ struct is_nested_tensor { template inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value; +} // namespace detail + +/// Forward decl for the tensor-view predicate. Specializations live in +/// `tensor/arena_tensor.h` (`ArenaTensor`, `detail::TensorInterface`) and +/// `external/btas.h` (`btas::TensorView`). Declared here so the operator-body +/// predicates below can consult it without including arena_tensor.h. +template +struct is_tensor_view : std::false_type {}; +template +inline constexpr bool is_tensor_view_v = is_tensor_view::value; + +namespace detail { + /// Predicate used by the shared operator body in -/// @c TiledArray/tensor/operators_body.ipp to gate the element-wise tensor -/// operators that are injected into @c namespace TiledArray . The btas-side -/// copy of the same operators (in @c external/btas.h) partial-specializes -/// this predicate to @c std::false_type for @c btas::Tensor so the two -/// namespaces' operators stay non-overlapping under ADL. +/// @c TiledArray/tensor/operators_body.ipp to gate the **value-returning** +/// element-wise tensor operators (@c +, @c -, @c *, @c neg) that are +/// injected into @c namespace TiledArray. These ops produce a *new* tensor +/// and so are only valid for *freestanding* (owning) tensor types -- a view +/// like `ArenaTensor` cannot allocate on its own. 
+/// +/// The btas-side copy of the same operators (in @c external/btas.h) +/// partial-specializes this predicate to @c std::false_type for @c +/// btas::Tensor so the two namespaces' operators stay non-overlapping under +/// ADL. template -struct ta_ops_match_tensor : is_nested_tensor {}; +struct ta_ops_match_tensor + : std::bool_constant::value && !is_tensor_view_v> {}; template inline constexpr bool ta_ops_match_tensor_v = ta_ops_match_tensor::value; +/// Predicate used by the operator body to gate the **compound-assignment** +/// (in-place) operators (@c +=, @c -=, @c *=). Mutating ops don't allocate, +/// so they're valid for any tensor whose storage we can mutate -- including +/// views. By default this is the freestanding predicate union'd with the +/// tensor-view predicate; the btas-side copy specializes it the same way it +/// does for the value-returning one. +template +struct ta_ops_match_tensor_inplace + : std::bool_constant::value> {}; + +template +inline constexpr bool ta_ops_match_tensor_inplace_v = + ta_ops_match_tensor_inplace::value; + //////////////////////////////////////////////////////////////////////////////// template @@ -478,6 +511,36 @@ constexpr bool is_annotation_v< >{true}; +// Detect whether T exposes a `rebind_t` member template. Owning tensor +// families (TA::Tensor, btas::Tensor) do; view types (TensorInterface, +// ShiftWrapper, ArenaTensor) do not. +template +struct has_rebind_t : std::false_type {}; +template +struct has_rebind_t>> + : std::true_type {}; + +/// The default freestanding (owning) tensor type associated with tensor type +/// `T` -- the type a value-returning op must produce when handed a `T`. +/// +/// This is purely the *view -> owning-tensor* map; rebinding the element +/// type is a separate concern (`rebind_t`). A tensor that is already +/// freestanding (exposes `rebind_t`, as `TA::Tensor`/`btas::Tensor` do) maps +/// to itself. A *view* type (`ArenaTensor`, `TensorInterface`, ...) 
cannot be +/// a value result and maps to the owning `TA::Tensor`. A view +/// may specialize this trait to name a different owning family (e.g. +/// `btas::TensorView` -> `btas::Tensor`). The mapped type is always +/// freestanding and therefore always exposes `rebind_t`. +template +struct default_freestanding_tensor { + using type = + std::conditional_t::value, T, + Tensor>; +}; +template +using default_freestanding_tensor_t = + typename default_freestanding_tensor::type; + namespace { template @@ -490,15 +553,6 @@ template constexpr bool is_binop_v>>{true}; -// Detect whether T exposes a `rebind_t` member template. Both TA::Tensor -// and btas::Tensor do; view types like TensorInterface and ShiftWrapper do -// not, so callers must fall back to a concrete tensor for the result type. -template -struct has_rebind_t : std::false_type {}; -template -struct has_rebind_t>> - : std::true_type {}; - template >> @@ -512,19 +566,18 @@ struct result_tensor_helper { public: using numeric_type = binop_result_t; - // Result tensor type stays in TensorA's family with the allocator rebound to - // hold `numeric_type`. TA::Tensor and btas::Tensor expose this as - // `rebind_t` (TA::Tensor via std::allocator_traits::rebind_alloc; btas - // via storage_traits::rebind_t). View types (TensorInterface, ShiftWrapper) - // satisfy is_tensor_v but have no `rebind_t` — fall back to TA::Tensor for - // those. An explicit @tparam Allocator override only applies when TensorA - // is a TA::Tensor. - using result_type = std::conditional_t< - std::is_same_v || !is_ta_tensor_v, - std::conditional_t::value, - typename TensorA_::template rebind_t, - TA::Tensor>, - TA::Tensor>; + // Result tensor type stays in TensorA's *freestanding* family -- TensorA + // itself if already owning, or its owning counterpart if TensorA is a view + // (see `default_freestanding_tensor`) -- with the allocator rebound to hold + // `numeric_type`. The freestanding type always exposes `rebind_t`. 
An + // explicit @tparam Allocator override only applies when TensorA is a + // TA::Tensor. + using result_type = + std::conditional_t || + !is_ta_tensor_v, + typename default_freestanding_tensor_t< + TensorA_>::template rebind_t, + TA::Tensor>; }; } // namespace diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h index ced9987d45..92bf366026 100644 --- a/src/TiledArray/tile_interface/add.h +++ b/src/TiledArray/tile_interface/add.h @@ -178,6 +178,35 @@ inline decltype(auto) add_to(Result&& result, const Arg& arg, return std::forward(result).add_to(arg, factor); } +/// axpy into the result tile: result[i] += arg[i] * factor. +/// Distinct from `add_to(result, arg, factor)` which has the legacy +/// `(result + arg) * factor` semantics; this one scales only the added +/// operand. Use this in fused-accumulation paths (e.g. an einsum loop +/// computing `out += arg * scalar`) where allocating a scaled temporary +/// would be either wasteful or impossible (e.g. for view tile types that +/// lack value-returning `scale`). +template && + detail::has_member_function_axpy_to_anyreturn_v< + Result&&, const Arg&, const Scalar>>::type* = nullptr> +inline decltype(auto) axpy_to(Result&& result, const Arg& arg, + const Scalar factor) { + return std::forward(result).axpy_to(arg, factor); +} + +/// axpy + fused permutation: result[i] += (perm ^ arg)[i] * factor. 
+template < + typename Result, typename Arg, typename Scalar, typename Perm, + typename std::enable_if< + detail::is_numeric_v && detail::is_permutation_v && + detail::has_member_function_axpy_to_anyreturn_v< + Result&&, const Arg&, const Scalar, const Perm&>>::type* = nullptr> +inline decltype(auto) axpy_to(Result&& result, const Arg& arg, + const Scalar factor, const Perm& perm) { + return std::forward(result).axpy_to(arg, factor, perm); +} + namespace tile_interface { using TiledArray::add; diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index 2a5e90ea5d..e599110d49 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -28,9 +28,13 @@ #include #include +#include #include #include #include +#include +#include +#include #include "../tile_interface/add.h" #include "../tile_interface/permute.h" @@ -81,23 +85,35 @@ class ContractReduceBase { private: struct Impl { + using left_tile_type = + std::remove_cv_t>; + using right_tile_type = + std::remove_cv_t>; + using arena_plan_storage_t = + TiledArray::detail::arena_plan_storage_t; + template < typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, + typename Plan = arena_plan_storage_t, typename = std::enable_if_t< TiledArray::detail::is_permutation_v< std::remove_reference_t> && std::is_invocable_r_v, result_value_type&, const left_value_type&, - const right_value_type&>>> + const right_value_type&> && + std::is_same_v, arena_plan_storage_t>>> Impl(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}, + Plan&& arena_plan_in = {}) : gemm_helper_(left_op, right_op, result_rank, left_rank, right_rank), alpha_(alpha), perm_(std::forward(perm)), - 
elem_muladd_op_(std::forward(elem_muladd_op)) { + elem_muladd_op_(std::forward(elem_muladd_op)), + arena_plan_(std::forward(arena_plan_in)) { // non-unit alpha must be absorbed into elem_muladd_op if (elem_muladd_op_) TA_ASSERT(alpha == scalar_type(1)); } @@ -111,6 +127,8 @@ class ContractReduceBase { /// type-erased reference to custom element multiply-add op /// \note the lifetime is managed by the callee! TiledArray::function_ref elem_muladd_op_; + + TA_NO_UNIQUE_ADDRESS arena_plan_storage_t arena_plan_; }; std::shared_ptr pimpl_; @@ -125,6 +143,8 @@ class ContractReduceBase { ContractReduceBase_& operator=(const ContractReduceBase_&) = default; ContractReduceBase_& operator=(ContractReduceBase_&&) = default; + using arena_plan_storage_t = typename Impl::arena_plan_storage_t; + /// Construct contract/reduce functor /// \tparam Perm a permutation type @@ -141,21 +161,26 @@ class ContractReduceBase { template < typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, + typename Plan = typename Impl::arena_plan_storage_t, typename = std::enable_if_t< TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, - const right_value_type&>>> + const right_value_type&> && + std::is_same_v, + typename Impl::arena_plan_storage_t>>> ContractReduceBase(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, Perm&& perm = {}, - ElemMultAddOp&& elem_muladd_op = {}) + ElemMultAddOp&& elem_muladd_op = {}, + Plan&& arena_plan_in = {}) : pimpl_(std::make_shared( left_op, right_op, alpha, result_rank, left_rank, right_rank, std::forward(perm), - std::forward(elem_muladd_op))) {} + std::forward(elem_muladd_op), + std::forward(arena_plan_in))) {} /// Gemm meta data accessor @@ -189,6 +214,14 @@ class ContractReduceBase { return pimpl_->elem_muladd_op_; } + /// Arena plan accessor + + 
/// \return A const reference to the arena plan storage + const auto& arena_plan() const { + TA_ASSERT(pimpl_); + return pimpl_->arena_plan_; + } + //-------------- these are only used for unit tests ----------------- /// Compute the number of contracted ranks @@ -277,18 +310,23 @@ class ContractReduce : public ContractReduceBase { template < typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, + typename Plan = typename ContractReduceBase_::arena_plan_storage_t, typename = std::enable_if_t< TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, - const right_value_type&>>> + const right_value_type&> && + std::is_same_v, + typename ContractReduceBase_::arena_plan_storage_t>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}, + Plan&& arena_plan_in = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, right_rank, std::forward(perm), - std::forward(elem_muladd_op)) {} + std::forward(elem_muladd_op), + std::forward(arena_plan_in)) {} /// Create a result type object @@ -313,8 +351,19 @@ class ContractReduce : public ContractReduceBase { /// target /// \param[in] arg The argument that will be added to \c result void operator()(result_type& result, const result_type& arg) const { - using TiledArray::add_to; - add_to(result, arg); + if constexpr ( + detail::is_contraction_arena_tot_v< + result_type, + std::remove_cv_t>, + std::remove_cv_t>>) { + // Two partial contraction results reduced from disjoint K-panel + // subsets can carry different inner-cell sparsity; union their shapes + // before accumulating. 
+ detail::arena_tot_add_to(result, arg); + } else { + using TiledArray::add_to; + add_to(result, arg); + } } /// Contract a pair of tiles and add to a target tile @@ -332,6 +381,26 @@ class ContractReduce : public ContractReduceBase { if constexpr (!ContractReduceBase_::plain_tensors) { TA_ASSERT(this->elem_muladd_op()); + if constexpr (detail::is_contraction_arena_tot_v< + result_type, + std::remove_cv_t< + std::remove_reference_t>, + std::remove_cv_t< + std::remove_reference_t>>) { + // The result tile is shaped from operand inner cells. A SUMMA + // reduction streams K-panels one at a time: the first panel sizes the + // result; a later panel of a contracted-dimension-sparse ToT operand + // can touch inner cells the first panel left null, so each subsequent + // panel extends the result to cover its own cells. + if (this->arena_plan().has_value()) { + if (empty(result)) + result = this->arena_plan()->reserve_and_construct( + left, right, this->gemm_helper()); + else + this->arena_plan()->grow_to_cover(result, left, right, + this->gemm_helper()); + } + } gemm(result, left, right, ContractReduceBase_::gemm_helper(), this->elem_muladd_op()); } else { // plain tensors @@ -404,18 +473,23 @@ class ContractReduce, + typename Plan = typename ContractReduceBase_::arena_plan_storage_t, typename = std::enable_if_t< TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, - const right_value_type&>>> + const right_value_type&> && + std::is_same_v, + typename ContractReduceBase_::arena_plan_storage_t>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}, + Plan&& arena_plan_in = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, right_rank, 
std::forward(perm), - std::forward(elem_muladd_op)) {} + std::forward(elem_muladd_op), + std::forward(arena_plan_in)) {} /// Create a result type object @@ -530,18 +604,23 @@ class ContractReduce, + typename Plan = typename ContractReduceBase_::arena_plan_storage_t, typename = std::enable_if_t< TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, - const right_value_type&>>> + const right_value_type&> && + std::is_same_v, + typename ContractReduceBase_::arena_plan_storage_t>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}, + Plan&& arena_plan_in = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, right_rank, std::forward(perm), - std::forward(elem_muladd_op)) {} + std::forward(elem_muladd_op), + std::forward(arena_plan_in)) {} /// Create a result type object diff --git a/src/TiledArray/tile_op/mult.h b/src/TiledArray/tile_op/mult.h index 329bf96e58..93ad37796f 100644 --- a/src/TiledArray/tile_op/mult.h +++ b/src/TiledArray/tile_op/mult.h @@ -27,6 +27,7 @@ #define TILEDARRAY_TILE_OP_MULT_H__INCLUDED #include +#include #include #include #include @@ -79,6 +80,37 @@ class Mult { /// \note the lifetime is managed by the callee! TiledArray::function_ref element_op_; + /// True when this Mult's result has view inner cells (e.g. ArenaTensor), + /// the only case in which tile_op_ is ever populated. Gates instantiation + /// of eval_tile_op so non-view result types (which need not provide a + /// `permute` member) are unaffected. + static constexpr bool uses_tile_op_ = + TiledArray::is_tensor_view_v; + + /// type-erased reference to a whole-tile op. When set, eval() delegates the + /// entire tile product to it. 
Used for arena tensor-of-tensors products + /// whose per-cell op cannot value-return (e.g. ArenaTensor view inner + /// cells), so the result tile must be shaped and filled as a unit. + /// \note the lifetime is managed by the callee! + TiledArray::function_ref + tile_op_; + + /// Delegates the whole tile product to tile_op_. + result_type eval_tile_op(const left_type& first, + const right_type& second) const { + return tile_op_(first, second); + } + + /// Delegates the whole tile product to tile_op_, then permutes the result. + template >> + result_type eval_tile_op(const left_type& first, const right_type& second, + const Perm& perm) const { + result_type result = tile_op_(first, second); + if (perm) result = result.permute(perm); + return result; + } + // Permuting tile evaluation function // These operations cannot consume the argument tile since this operation // requires temporary storage space. @@ -86,6 +118,9 @@ class Mult { TiledArray::detail::is_permutation_v>> result_type eval(const left_type& first, const right_type& second, const Perm& perm) const { + if constexpr (uses_tile_op_) { + if (tile_op_) return eval_tile_op(first, second, perm); + } if (!element_op_) { using TiledArray::mult; return mult(first, second, perm); @@ -117,6 +152,9 @@ class Mult { template ::type* = nullptr> result_type eval(const left_type& first, const right_type& second) const { + if constexpr (uses_tile_op_) { + if (tile_op_) return eval_tile_op(first, second); + } if (!element_op_) { using TiledArray::mult; return mult(first, second); @@ -128,9 +166,21 @@ class Mult { template ::type* = nullptr> result_type eval(left_type& first, const right_type& second) const { + if constexpr (uses_tile_op_) { + if (tile_op_) return eval_tile_op(first, second); + } if (!element_op_) { - using TiledArray::mult_to; - return mult_to(std::move(first), second); + if constexpr (uses_tile_op_) { + // View inner cells (e.g. 
ArenaTensor): a "consumable" tile is a + // shallow handle whose arena slab may be aliased by a persistent + // array, so an in-place mult_to would corrupt that operand. Always + // produce a fresh result for view-cell tiles. + using TiledArray::mult; + return mult(first, second); + } else { + using TiledArray::mult_to; + return mult_to(std::move(first), second); + } } else { // TODO figure out why this does not compiles!!! // using TiledArray::inplace_binary; @@ -144,9 +194,19 @@ class Mult { template ::type* = nullptr> result_type eval(const left_type& first, right_type& second) const { + if constexpr (uses_tile_op_) { + if (tile_op_) return eval_tile_op(first, second); + } if (!element_op_) { - using TiledArray::mult_to; - return mult_to(std::move(second), first); + if constexpr (uses_tile_op_) { + // View inner cells: never consume a shallow handle in place (see the + // consume-left overload above). + using TiledArray::mult; + return mult(first, second); + } else { + using TiledArray::mult_to; + return mult_to(std::move(second), first); + } } else { // WARNING: element_op_ might be noncommuting, so can't swap first // and second! for GEMM could optimize, but can't introspect // element_op_ @@ -195,6 +255,20 @@ class Mult { const left_value_type&, const right_value_type&>>> explicit Mult(ElementOp&& op) : element_op_(std::forward(op)) {} + /// Tag selecting the whole-tile-op constructor. + struct tile_op_tag {}; + + /// Construct using a whole-tile op. When set, eval() delegates the entire + /// tile product to \p op instead of multiplying element-wise. Used for + /// arena tensor-of-tensors products whose per-cell op cannot value-return. 
+ /// \tparam TileOp a callable with signature + /// `result_type(const left_type&, const right_type&)` + /// \param op the whole-tile operation + template , + const left_type&, const right_type&>>> + Mult(tile_op_tag, TileOp&& op) : tile_op_(std::forward(op)) {} + /// Multiply-and-permute operator /// Compute the product of two tiles and permute the result. diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index b18fc8c7ee..35b89365b8 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -377,6 +377,8 @@ GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(add) GENERATE_HAS_MEMBER_FUNCTION(add) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(add_to) GENERATE_HAS_MEMBER_FUNCTION(add_to) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(axpy_to) +GENERATE_HAS_MEMBER_FUNCTION(axpy_to) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(subt) GENERATE_HAS_MEMBER_FUNCTION(subt) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(subt_to) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7a3840ea12..308a3dec1e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -103,6 +103,14 @@ set(ta_test_src_files ta_test.cpp linalg.cpp cp.cpp btas.cpp + arena.cpp + arena_kernels.cpp + arena_einsum_unit_suite.cpp + arena_tot_trivial.cpp + arena_sizeof_invariant_suite.cpp + arena_tensor.cpp + arena_tensor_kernels.cpp + tot_construction.cpp ) if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP) @@ -186,3 +194,8 @@ else() ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT}" ) endif() + +if (NOT TARGET test-cases-tiledarray) + add_custom_target_subproject(tiledarray test-cases) +endif() +add_subdirectory(cases) diff --git a/tests/arena.cpp b/tests/arena.cpp new file mode 100644 index 0000000000..46273e8645 --- /dev/null +++ b/tests/arena.cpp @@ -0,0 +1,131 @@ +#include "TiledArray/tensor/arena.h" + +#include "tiledarray.h" +#include "unit_test_config.h" + +#include +#include +#include +#include + +using TiledArray::detail::Arena; +using TiledArray::detail::ArenaPlan; +using 
TiledArray::detail::ArenaResource; +using TiledArray::detail::plan; + +namespace { +// Minimal Range-like shim for plan() tests: supports only volume(). +struct FakeRange { + std::size_t v; + std::size_t volume() const noexcept { return v; } +}; +} + +BOOST_AUTO_TEST_SUITE(arena_suite, TA_UT_LABEL_SERIAL) + +BOOST_AUTO_TEST_CASE(default_arena_is_empty) { + Arena a; + BOOST_CHECK_EQUAL(a.capacity(), 0u); + BOOST_CHECK_EQUAL(a.cursor(), 0u); + BOOST_CHECK(a.empty()); + BOOST_CHECK(a.resource() != nullptr); +} + +BOOST_AUTO_TEST_CASE(reserve_initializes_capacity) { + Arena a; + a.reserve(1024); + BOOST_CHECK_EQUAL(a.capacity(), 1024u); + BOOST_CHECK_EQUAL(a.cursor(), 0u); + BOOST_CHECK_EQUAL(a.remaining(), 1024u); +} + +BOOST_AUTO_TEST_CASE(reserve_zero_init_clears_slab) { + Arena a; + a.reserve(64, /*zero_init=*/true); + auto h = a.slice(0, 64); + for (std::size_t i = 0; i < 64; ++i) BOOST_CHECK_EQUAL(h[i], 0u); +} + +BOOST_AUTO_TEST_CASE(slice_random_access_and_aliasing) { + Arena a; + a.reserve(1024); + std::shared_ptr p1 = a.slice(0, 4); + std::shared_ptr p2 = a.slice(64, 4); + for (int i = 0; i < 4; ++i) p1[i] = double(i); + for (int i = 0; i < 4; ++i) p2[i] = double(10 + i); + for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p1[i], double(i)); + for (int i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(p2[i], double(10 + i)); + BOOST_CHECK(static_cast(&p2[0]) >= static_cast(&p1[4])); +} + +BOOST_AUTO_TEST_CASE(claim_advances_cursor_and_aligns) { + Arena a; + a.reserve(1024); + std::shared_ptr h = a.claim(10); + BOOST_REQUIRE(h.get() != nullptr); + BOOST_CHECK_EQUAL(reinterpret_cast(h.get()) % alignof(double), + 0u); + BOOST_CHECK(a.cursor() >= 10u * sizeof(double)); +} + +BOOST_AUTO_TEST_CASE(slab_survives_arena_destruction) { + std::shared_ptr survivor; + { + Arena tmp; + tmp.reserve(256); + survivor = tmp.claim(10); + for (int i = 0; i < 10; ++i) survivor[i] = -i; + } + for (int i = 0; i < 10; ++i) BOOST_CHECK_EQUAL(survivor[i], -i); +} + 
+BOOST_AUTO_TEST_CASE(plan_uniform_cells) { + ArenaPlan p = plan( + /*N_cells=*/6, + /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{10}; }, + /*element_size=*/sizeof(double), + /*alignment=*/alignof(double)); + BOOST_CHECK_EQUAL(p.total_bytes, 6u * 10u * sizeof(double)); + BOOST_CHECK_EQUAL(p.offsets.size(), 6u); + BOOST_CHECK_EQUAL(p.offsets[0], 0u); + BOOST_CHECK_EQUAL(p.offsets[5], 5u * 10u * sizeof(double)); +} + +BOOST_AUTO_TEST_CASE(plan_variable_cells_match_pivot_doc_example) { + ArenaPlan p = plan( + /*N_cells=*/12, + /*shape_fn=*/[](std::size_t /*ord*/) { return FakeRange{20}; }, + /*element_size=*/sizeof(double), + /*alignment=*/alignof(double)); + BOOST_CHECK_EQUAL(p.total_bytes, 12u * 20u * sizeof(double)); + BOOST_CHECK_EQUAL(p.offsets[1], 20u * sizeof(double)); +} + +BOOST_AUTO_TEST_CASE(plan_then_construct_then_read) { + const std::size_t N = 4; + std::vector volumes = {3, 5, 2, 7}; + auto shape_fn = [&volumes](std::size_t ord) { return FakeRange{volumes[ord]}; }; + ArenaPlan p = plan(N, shape_fn, sizeof(double), alignof(double)); + Arena a; + a.reserve(p.total_bytes); + std::vector> handles(N); + for (std::size_t ord = 0; ord < N; ++ord) { + handles[ord] = a.slice(p.offsets[ord], volumes[ord]); + for (std::size_t i = 0; i < volumes[ord]; ++i) + handles[ord][i] = double(100 * ord + i); + } + for (std::size_t ord = 0; ord < N; ++ord) + for (std::size_t i = 0; i < volumes[ord]; ++i) + BOOST_CHECK_EQUAL(handles[ord][i], double(100 * ord + i)); +} + +BOOST_AUTO_TEST_CASE(arena_resource_is_identity_equal) { + Arena a; + a.reserve(64); + ArenaResource r1(&a); + ArenaResource r2(&a); + BOOST_CHECK(r1.is_equal(r1)); + BOOST_CHECK(!r1.is_equal(r2)); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/arena_einsum_unit_suite.cpp b/tests/arena_einsum_unit_suite.cpp new file mode 100644 index 0000000000..6de13f8c84 --- /dev/null +++ b/tests/arena_einsum_unit_suite.cpp @@ -0,0 +1,253 @@ +/// Unit tests for arena einsum plans and dispatch. 
+ +#include "TiledArray/tensor/arena_einsum.h" + +#include "tiledarray.h" +#include "unit_test_config.h" + +BOOST_AUTO_TEST_SUITE(arena_einsum_unit_suite, TA_UT_LABEL_SERIAL) + +namespace TA = TiledArray; + +BOOST_AUTO_TEST_CASE(inner_shape_plan_left_range) { + TA::Tensor l(TA::Range{3, 4}); + TA::Tensor r(TA::Range{3, 4}); + TA::detail::ArenaInnerShapePlan p{ + TA::detail::ArenaInnerShapeKind::left_range, std::nullopt}; + auto out = p.make(l, r); + BOOST_CHECK(out == l.range()); +} + +BOOST_AUTO_TEST_CASE(inner_shape_plan_right_range) { + TA::Tensor l(TA::Range{3, 4}); + TA::Tensor r(TA::Range{5, 6}); + TA::detail::ArenaInnerShapePlan p{ + TA::detail::ArenaInnerShapeKind::right_range, std::nullopt}; + auto out = p.make(l, r); + BOOST_CHECK(out == r.range()); +} + +BOOST_AUTO_TEST_CASE(inner_shape_plan_gemm_result_range) { + TA::Tensor l(TA::Range{3, 5}); + TA::Tensor r(TA::Range{5, 4}); + TA::math::GemmHelper gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::detail::ArenaInnerShapePlan p{ + TA::detail::ArenaInnerShapeKind::gemm_result_range, + std::make_optional(gh)}; + auto out = p.make(l, r); + BOOST_CHECK_EQUAL(out.volume(), std::size_t{12}); +} + +BOOST_AUTO_TEST_CASE(is_contraction_arena_tot_v_predicate) { + using ToT = TA::Tensor>; + static_assert(TA::detail::is_contraction_arena_tot_v); + using Plain = TA::Tensor; + static_assert(!TA::detail::is_contraction_arena_tot_v); + BOOST_CHECK(true); +} + +BOOST_AUTO_TEST_CASE(arena_plan_storage_t_resolves) { + using ToT = TA::Tensor>; + using Plain = TA::Tensor; + using ToTStorage = TA::detail::arena_plan_storage_t; + using PlainStorage = TA::detail::arena_plan_storage_t; + static_assert(!std::is_same_v); + static_assert(std::is_same_v); + BOOST_CHECK(true); +} + +BOOST_AUTO_TEST_CASE(make_plan_returns_nullopt_when_disabled) { + using ToT = TA::Tensor>; + TA::detail::arena_disabled() = true; + auto plan = TA::detail::make_contraction_arena_plan( + 
TA::detail::ArenaInnerShapeKind::left_range, std::nullopt, + TA::Permutation{}); + BOOST_CHECK(!plan.has_value()); + TA::detail::arena_disabled() = false; +} + +BOOST_AUTO_TEST_CASE(make_plan_returns_nullopt_for_plain_tensor) { + using Plain = TA::Tensor; + // Non-ToT gating happens inside the function body, not in the return type. + auto plan = TA::detail::make_contraction_arena_plan( + TA::detail::ArenaInnerShapeKind::left_range, std::nullopt, + TA::Permutation{}); + BOOST_CHECK(!plan.has_value()); +} + +BOOST_AUTO_TEST_CASE(make_plan_rejects_nonidentity_inner_perm) { + using ToT = TA::Tensor>; + TA::Permutation perm({1, 0}); + auto plan = TA::detail::make_contraction_arena_plan( + TA::detail::ArenaInnerShapeKind::left_range, std::nullopt, perm); + BOOST_CHECK(!plan.has_value()); +} + +BOOST_AUTO_TEST_CASE(make_plan_returns_active_for_tot) { + using ToT = TA::Tensor>; + auto plan = TA::detail::make_contraction_arena_plan( + TA::detail::ArenaInnerShapeKind::left_range, std::nullopt, + TA::Permutation{}); + BOOST_CHECK(plan.has_value()); +} + +namespace { +using ToT = TA::Tensor>; + +// Placement-new initializes each ToT inner cell in existing tensor storage. 
+ToT make_uniform_tot(const TA::Range& outer, const TA::Range& inner, + double fill) { + ToT t(outer); + const std::size_t vol = outer.volume(); + for (std::size_t i = 0; i < vol; ++i) { + new (t.data() + i) TA::Tensor(inner, fill); + } + return t; +} +} // namespace + +BOOST_AUTO_TEST_CASE(reserve_and_construct_uniform_inner) { + TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + auto left = make_uniform_tot(TA::Range{2, 3}, TA::Range{3, 5}, 1.0); + auto right = make_uniform_tot(TA::Range{3, 4}, TA::Range{5, 4}, 1.0); + TA::detail::ArenaInnerShapePlan inner_plan{ + TA::detail::ArenaInnerShapeKind::gemm_result_range, + std::make_optional(inner_gh)}; + TA::detail::ContractionArenaPlan plan(inner_plan); + ToT result = plan.reserve_and_construct(left, right, outer_gh); + BOOST_CHECK_EQUAL(result.range().volume(), std::size_t{8}); + BOOST_CHECK_EQUAL(result.data()[0].range().volume(), std::size_t{12}); +} + +BOOST_AUTO_TEST_CASE(reserve_and_construct_zero_volume_outer_skips_reserve) { + TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + auto left = make_uniform_tot(TA::Range{0, 3}, TA::Range{3, 5}, 1.0); + auto right = make_uniform_tot(TA::Range{3, 2}, TA::Range{5, 4}, 1.0); + TA::detail::ArenaInnerShapePlan inner_plan{ + TA::detail::ArenaInnerShapeKind::gemm_result_range, + std::make_optional(inner_gh)}; + TA::detail::ContractionArenaPlan plan(inner_plan); + ToT result = plan.reserve_and_construct(left, right, outer_gh); + BOOST_CHECK_EQUAL(result.range().volume(), std::size_t{0}); +} + +BOOST_AUTO_TEST_CASE(reserve_and_construct_jagged_inner_per_cell) { + // Jagged left cells make first-non-empty K-strip range selection observable. 
+ TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + ToT left(TA::Range{2, 3}); + for (std::size_t m = 0; m < 2; ++m) + for (std::size_t k = 0; k < 3; ++k) { + TA::Range r{static_cast(m + 1), static_cast(k + 2)}; + new (left.data() + (m * 3 + k)) TA::Tensor(r, 1.0); + } + auto right = make_uniform_tot(TA::Range{3, 2}, TA::Range{2, 2}, 1.0); + TA::detail::ArenaInnerShapePlan inner_plan{ + TA::detail::ArenaInnerShapeKind::left_range, std::nullopt}; + TA::detail::ContractionArenaPlan plan(inner_plan); + ToT result = plan.reserve_and_construct(left, right, outer_gh); + BOOST_CHECK_EQUAL(result.range().volume(), std::size_t{4}); + BOOST_CHECK_EQUAL(result.data()[0].range().volume(), std::size_t{2}); + BOOST_CHECK_EQUAL(result.data()[1].range().volume(), std::size_t{2}); + BOOST_CHECK_EQUAL(result.data()[2].range().volume(), std::size_t{4}); + BOOST_CHECK_EQUAL(result.data()[3].range().volume(), std::size_t{4}); +} + +BOOST_AUTO_TEST_CASE(fused_hadamard_inplace_accumulates) { + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor l(TA::Range{4}, 1.0); + TA::Tensor rr(TA::Range{4}, 2.0); + TA::detail::fused_hadamard_inplace(r, l, rr); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 2.0, 1e-12); + TA::detail::fused_hadamard_inplace(r, l, rr); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 4.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_hadamard_scaled_inplace_accumulates) { + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor l(TA::Range{4}, 1.0); + TA::Tensor rr(TA::Range{4}, 2.0); + TA::detail::fused_hadamard_scaled_inplace(r, l, rr, 3.0); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 6.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_scale_tot_x_t_inplace_accumulates) { + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor l(TA::Range{4}, 1.5); + TA::detail::fused_scale_tot_x_t_inplace(r, l, 2.0); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 3.0, 
1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_scale_t_x_tot_inplace_accumulates) { + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor rr(TA::Range{4}, 2.5); + TA::detail::fused_scale_t_x_tot_inplace(r, 4.0, rr); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 10.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_contraction_inplace_accumulates) { + TA::Tensor r(TA::Range{2, 2}, 0.0); + TA::Tensor l(TA::Range{2, 2}, 1.0); + TA::Tensor rr(TA::Range{2, 2}, 2.0); + TA::math::GemmHelper gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::detail::fused_contraction_inplace(r, l, rr, 1.0, gh); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 4.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_hadamard_lambda_round_trip) { + auto fn = TA::detail::make_fused_hadamard_lambda< + TA::Tensor, TA::Tensor, TA::Tensor>(); + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor l(TA::Range{4}, 1.0); + TA::Tensor rr(TA::Range{4}, 2.0); + fn(r, l, rr); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 2.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_hadamard_scaled_lambda_round_trip) { + auto fn = TA::detail::make_fused_hadamard_scaled_lambda< + TA::Tensor, TA::Tensor, TA::Tensor, double>(3.0); + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor l(TA::Range{4}, 1.0); + TA::Tensor rr(TA::Range{4}, 2.0); + fn(r, l, rr); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 6.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_scale_tot_x_t_lambda_round_trip) { + auto fn = TA::detail::make_fused_scale_tot_x_t_lambda< + TA::Tensor, TA::Tensor, double>(); + TA::Tensor r(TA::Range{4}, 0.0); + TA::Tensor l(TA::Range{4}, 1.5); + fn(r, l, 2.0); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 3.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(fused_scale_t_x_tot_lambda_round_trip) { + auto fn = TA::detail::make_fused_scale_t_x_tot_lambda< + TA::Tensor, double, TA::Tensor>(); + TA::Tensor r(TA::Range{4}, 0.0); + 
TA::Tensor rr(TA::Range{4}, 2.5); + fn(r, 4.0, rr); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_CLOSE(r.data()[i], 10.0, 1e-12); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/arena_kernels.cpp b/tests/arena_kernels.cpp new file mode 100644 index 0000000000..4e278fd495 --- /dev/null +++ b/tests/arena_kernels.cpp @@ -0,0 +1,160 @@ +/// Unit tests for arena-backed ToT kernels. + +#include "TiledArray/tensor/arena_kernels.h" + +#include "TiledArray/tensor.h" +#include "TiledArray/tensor/arena.h" +#include "tiledarray.h" +#include "unit_test_config.h" + +#include +#include + +namespace TA = TiledArray; +using inner_t = TA::Tensor; +using outer_t = TA::Tensor; + +namespace { + +outer_t make_tot(std::size_t N_outer, std::size_t n_inner, double base = 1.0) { + outer_t outer(TA::Range{static_cast(N_outer)}, 1); + for (std::size_t ord = 0; ord < N_outer; ++ord) { + inner_t inner(TA::Range{static_cast(n_inner)}); + for (std::size_t i = 0; i < n_inner; ++i) + inner.at_ordinal(i) = base + ord * 100.0 + i; + *(outer.data() + ord) = std::move(inner); + } + return outer; +} + +bool tot_equal(const outer_t& a, const outer_t& b) { + if (a.range().volume() != b.range().volume()) return false; + for (std::size_t ord = 0; ord < a.range().volume(); ++ord) { + const inner_t& ai = *(a.data() + ord); + const inner_t& bi = *(b.data() + ord); + if (ai.range().volume() != bi.range().volume()) return false; + for (std::size_t i = 0; i < ai.range().volume(); ++i) + if (ai.at_ordinal(i) != bi.at_ordinal(i)) return false; + } + return true; +} + +} // namespace + +BOOST_AUTO_TEST_SUITE(arena_kernels_suite, TA_UT_LABEL_SERIAL) + +BOOST_AUTO_TEST_CASE(trivial_unary_clone_matches_heap_baseline) { + outer_t src = make_tot(4, 5, 1.0); + auto fill = [](double* dst, const double* src, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i]; + }; + outer_t arena_result = TA::detail::arena_trivial_unary(src, fill); + BOOST_CHECK(tot_equal(arena_result, src)); +} + 
+BOOST_AUTO_TEST_CASE(trivial_unary_scale_matches_heap_baseline) { + outer_t src = make_tot(4, 5, 1.0); + const double factor = 2.5; + auto fill = [factor](double* dst, const double* src, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i] * factor; + }; + outer_t arena_result = TA::detail::arena_trivial_unary(src, fill); + outer_t baseline(src.range(), 1); + for (std::size_t ord = 0; ord < src.range().volume(); ++ord) { + inner_t inner((src.data() + ord)->range()); + for (std::size_t i = 0; i < inner.range().volume(); ++i) + inner.at_ordinal(i) = (src.data() + ord)->at_ordinal(i) * factor; + *(baseline.data() + ord) = std::move(inner); + } + BOOST_CHECK(tot_equal(arena_result, baseline)); +} + +BOOST_AUTO_TEST_CASE(trivial_binary_add_matches_heap_baseline) { + outer_t L = make_tot(4, 5, 1.0); + outer_t R = make_tot(4, 5, 0.5); + auto fill = [](double* dst, const double* l, const double* r, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = l[i] + r[i]; + }; + outer_t arena_result = TA::detail::arena_trivial_binary(L, R, fill); + outer_t baseline(L.range(), 1); + for (std::size_t ord = 0; ord < L.range().volume(); ++ord) { + inner_t inner((L.data() + ord)->range()); + for (std::size_t i = 0; i < inner.range().volume(); ++i) + inner.at_ordinal(i) = + (L.data() + ord)->at_ordinal(i) + (R.data() + ord)->at_ordinal(i); + *(baseline.data() + ord) = std::move(inner); + } + BOOST_CHECK(tot_equal(arena_result, baseline)); +} + +BOOST_AUTO_TEST_CASE(arena_outlives_kernel_call) { + // The result data deleter co-owns the arena. 
+ outer_t arena_result; + { + outer_t src = make_tot(3, 4, 7.0); + auto fill = [](double* dst, const double* src, std::size_t n) { + for (std::size_t i = 0; i < n; ++i) dst[i] = src[i]; + }; + arena_result = TA::detail::arena_trivial_unary(src, fill); + } + for (std::size_t ord = 0; ord < arena_result.range().volume(); ++ord) + for (std::size_t i = 0; i < (arena_result.data() + ord)->range().volume(); + ++i) + BOOST_CHECK_EQUAL((arena_result.data() + ord)->at_ordinal(i), + 7.0 + ord * 100.0 + i); +} + +BOOST_AUTO_TEST_CASE(inner_permute_transposes_each_cell) { + // outer tile of 3 cells, each a non-uniform r x c inner matrix + outer_t src(TA::Range{3}, 1); + for (std::size_t ord = 0; ord < 3; ++ord) { + const long r = 2 + static_cast(ord); // 2, 3, 4 + const long c = 3 + static_cast(ord % 2); // 3, 4, 3 + inner_t inner(TA::Range{r, c}); + for (long i = 0; i < r; ++i) + for (long j = 0; j < c; ++j) + inner(i, j) = 1.0 + ord * 100.0 + i * 10.0 + j; + *(src.data() + ord) = std::move(inner); + } + auto result = + TA::detail::arena_inner_permute(src, TA::Permutation{1, 0}); + for (std::size_t ord = 0; ord < 3; ++ord) { + const inner_t& s = *(src.data() + ord); + const inner_t& d = *(result.data() + ord); + const long r = s.range().extent(0); + const long c = s.range().extent(1); + BOOST_REQUIRE_EQUAL(d.range().rank(), 2u); + BOOST_CHECK_EQUAL(d.range().extent(0), c); + BOOST_CHECK_EQUAL(d.range().extent(1), r); + for (long i = 0; i < r; ++i) + for (long j = 0; j < c; ++j) BOOST_CHECK_EQUAL(d(j, i), s(i, j)); + } +} + +BOOST_AUTO_TEST_CASE(inner_permute_rank3_cell) { + outer_t src(TA::Range{2}, 1); + const long e0 = 2, e1 = 3, e2 = 4; + for (std::size_t ord = 0; ord < 2; ++ord) { + inner_t inner(TA::Range{e0, e1, e2}); + for (long a = 0; a < e0; ++a) + for (long b = 0; b < e1; ++b) + for (long c = 0; c < e2; ++c) + inner(a, b, c) = ord * 1000.0 + a * 100.0 + b * 10.0 + c; + *(src.data() + ord) = std::move(inner); + } + // perm {2,0,1}: src dim 0->2, 1->0, 2->1 => 
result(b,c,a) == src(a,b,c) + auto result = + TA::detail::arena_inner_permute(src, TA::Permutation{2, 0, 1}); + for (std::size_t ord = 0; ord < 2; ++ord) { + const inner_t& s = *(src.data() + ord); + const inner_t& d = *(result.data() + ord); + BOOST_CHECK_EQUAL(d.range().extent(0), e1); + BOOST_CHECK_EQUAL(d.range().extent(1), e2); + BOOST_CHECK_EQUAL(d.range().extent(2), e0); + for (long a = 0; a < e0; ++a) + for (long b = 0; b < e1; ++b) + for (long c = 0; c < e2; ++c) BOOST_CHECK_EQUAL(d(b, c, a), s(a, b, c)); + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/arena_sizeof_invariant_suite.cpp b/tests/arena_sizeof_invariant_suite.cpp new file mode 100644 index 0000000000..649e3a50c5 --- /dev/null +++ b/tests/arena_sizeof_invariant_suite.cpp @@ -0,0 +1,64 @@ +/// Locks plain-tensor zero-overhead from the arena plan storage field. + +#include "TiledArray/tensor.h" +#include "TiledArray/tensor/arena_einsum.h" +#include "TiledArray/util/function.h" +#include "tiledarray.h" +#include "unit_test_config.h" + +#include +#include +#include + +namespace TA = TiledArray; + +namespace { + +using PlainResult = TA::Tensor; +using PlainLeft = TA::Tensor; +using PlainRight = TA::Tensor; +using PlainScalar = double; + +using PlainArenaPlanStorage = + TA::detail::arena_plan_storage_t; + +using PlainElemMulAddOp = + TA::function_ref; + +/// Shadows the public field order of ContractReduceBase::Impl on master. +struct ImplLayoutMaster { + TA::math::GemmHelper gemm_helper_; + PlainScalar alpha_; + TA::BipartitePermutation perm_; + PlainElemMulAddOp elem_muladd_op_; +}; + +/// Same as ImplLayoutMaster + trailing TA_NO_UNIQUE_ADDRESS arena_plan_. 
+struct ImplLayoutAllocator { + TA::math::GemmHelper gemm_helper_; + PlainScalar alpha_; + TA::BipartitePermutation perm_; + PlainElemMulAddOp elem_muladd_op_; + TA_NO_UNIQUE_ADDRESS PlainArenaPlanStorage arena_plan_; +}; + +static_assert(std::is_same_v, + "plain-tensor arena_plan_storage_t must be std::monostate"); + +static_assert(sizeof(ImplLayoutAllocator) == sizeof(ImplLayoutMaster), + "TA_NO_UNIQUE_ADDRESS failed to fold arena_plan_ into padding"); + +} + +BOOST_AUTO_TEST_SUITE(arena_sizeof_invariant_suite, TA_UT_LABEL_SERIAL) + +BOOST_AUTO_TEST_CASE(impl_layout_no_unique_address_invariant) { + BOOST_CHECK_EQUAL(sizeof(ImplLayoutAllocator), sizeof(ImplLayoutMaster)); +} + +BOOST_AUTO_TEST_CASE(plain_arena_plan_storage_is_monostate) { + BOOST_CHECK((std::is_same_v)); + BOOST_CHECK_EQUAL(sizeof(PlainArenaPlanStorage), sizeof(std::monostate)); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/arena_tensor.cpp b/tests/arena_tensor.cpp new file mode 100644 index 0000000000..9b47e1116f --- /dev/null +++ b/tests/arena_tensor.cpp @@ -0,0 +1,338 @@ +/// Unit tests for TiledArray::ArenaTensor: null state, view copy/move, +/// foreign-tensor assignment, in-place CPOs, materialize. + +#include "TiledArray/tensor/arena_tensor.h" + +#include "TiledArray/external/btas.h" +#include "TiledArray/tensor.h" +#include "TiledArray/tensor/tensor_map.h" +#include "tiledarray.h" +#include "unit_test_config.h" + +#include +#include + +#include +#include +#include +#include +#include + +namespace TA = TiledArray; +// Tests use TA::Range explicitly so the standalone target for materialize() +// is the natural TA::Tensor; the type's default range is +// btas::zb::RangeNd, which pairs with btas::Tensor as the standalone. +using Inner = TA::ArenaTensor; + +namespace { + +/// Holds an over-aligned byte buffer big enough for a single `Inner` cell of +/// `n` elements. 
+struct CellBuf { + std::vector bytes; + std::byte* aligned_ptr = nullptr; + + explicit CellBuf(std::size_t n_elems) { + const std::size_t total = Inner::cell_size(n_elems); + const std::size_t algn = Inner::cell_alignment(); + bytes.assign(total + algn, std::byte{0}); + auto base = reinterpret_cast(bytes.data()); + auto aligned = (base + algn - 1) & ~(algn - 1); + aligned_ptr = reinterpret_cast(aligned); + } +}; + +} // namespace + +BOOST_AUTO_TEST_SUITE(arena_tensor_suite, TA_UT_LABEL_SERIAL) + +BOOST_AUTO_TEST_CASE(sizeof_is_one_pointer) { + BOOST_CHECK_EQUAL(sizeof(Inner), sizeof(void*)); +} + +BOOST_AUTO_TEST_CASE(sizeof_invariant_across_range_parameterizations) { + // `ArenaTensor`'s footprint must be one pointer regardless of the range + // template parameter -- this is the original motivation for the type. The + // default `btas::zb::RangeNd<>` (~14 B + alignment) and `TA::Range` + // (~300 B) both go behind the same `Cell*` indirection. + static_assert(sizeof(TA::ArenaTensor) == sizeof(void*), + "default-range ArenaTensor must be one pointer"); + static_assert( + sizeof(TA::ArenaTensor>) == sizeof(void*), + "zb::RangeNd ArenaTensor must be one pointer"); + static_assert(sizeof(TA::ArenaTensor) == sizeof(void*), + "TA::Range ArenaTensor must be one pointer"); + // Different element type same story. + static_assert(sizeof(TA::ArenaTensor) == sizeof(void*)); + static_assert(sizeof(TA::ArenaTensor>) == sizeof(void*)); + BOOST_CHECK(true); +} + +BOOST_AUTO_TEST_CASE(element_data_is_simd_aligned) { + // data_alignment() should be at least kInnerSimdAlign; cell_alignment() + // should propagate that so the element pointer is SIMD-aligned. 
+ BOOST_CHECK(Inner::data_alignment() >= TA::kInnerSimdAlign); + BOOST_CHECK_EQUAL(Inner::data_alignment() % TA::kInnerSimdAlign, 0u); + BOOST_CHECK(Inner::cell_alignment() >= Inner::data_alignment()); + CellBuf buf(8); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{8}); + auto addr = reinterpret_cast(x.data()); + BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); +} + +BOOST_AUTO_TEST_CASE(default_constructed_is_null) { + Inner x; + BOOST_CHECK(!x); + BOOST_CHECK(x.empty()); + BOOST_CHECK_EQUAL(x.size(), 0u); + BOOST_CHECK(x.data() == nullptr); +} + +BOOST_AUTO_TEST_CASE(make_arena_tensor_zero_initialized) { + CellBuf buf(6); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{6}); + BOOST_REQUIRE(bool(x)); + BOOST_CHECK(!x.empty()); + BOOST_CHECK_EQUAL(x.size(), 6u); + for (std::size_t i = 0; i < x.size(); ++i) + BOOST_CHECK_EQUAL(x.data()[i], 0.0); +} + +BOOST_AUTO_TEST_CASE(copy_construction_yields_alias) { + CellBuf buf(4); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{4}); + Inner y = x; + BOOST_CHECK(bool(x)); + BOOST_CHECK(bool(y)); + BOOST_CHECK_EQUAL(x.data(), y.data()); + y.data()[0] = 42.0; + BOOST_CHECK_EQUAL(x.data()[0], 42.0); +} + +BOOST_AUTO_TEST_CASE(move_leaves_source_null) { + CellBuf buf(4); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{4}); + Inner y = std::move(x); + BOOST_CHECK(!x); + BOOST_CHECK(bool(y)); + BOOST_CHECK_EQUAL(y.size(), 4u); +} + +BOOST_AUTO_TEST_CASE(operator_assign_from_ta_tensor_copies_elements) { + CellBuf buf(5); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{5}); + TA::Tensor src(TA::Range{5}, 0.0); + for (std::size_t i = 0; i < 5; ++i) src.data()[i] = double(i + 1); + x = src; + for (std::size_t i = 0; i < 5; ++i) + BOOST_CHECK_EQUAL(x.data()[i], double(i + 1)); +} + +BOOST_AUTO_TEST_CASE(zero_fills_with_zeros) { + CellBuf buf(4); + Inner x = + 
TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{4}); + for (std::size_t i = 0; i < 4; ++i) x.data()[i] = 7.0; + TA::zero(x); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(x.data()[i], 0.0); +} + +BOOST_AUTO_TEST_CASE(fill_sets_all_elements) { + CellBuf buf(4); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{4}); + TA::fill(x, 3.5); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(x.data()[i], 3.5); +} + +BOOST_AUTO_TEST_CASE(scale_to_multiplies_in_place) { + CellBuf buf(4); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{4}); + TA::fill(x, 2.0); + TA::scale_to(x, 3.0); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(x.data()[i], 6.0); +} + +BOOST_AUTO_TEST_CASE(add_to_accumulates) { + CellBuf bd(4), bs(4); + Inner dst = + TA::detail::make_arena_tensor_in(bd.aligned_ptr, TA::Range{4}); + Inner src = + TA::detail::make_arena_tensor_in(bs.aligned_ptr, TA::Range{4}); + TA::fill(dst, 1.0); + TA::fill(src, 2.0); + TA::add_to(dst, src); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 3.0); +} + +BOOST_AUTO_TEST_CASE(subt_to_subtracts) { + CellBuf bd(4), bs(4); + Inner dst = + TA::detail::make_arena_tensor_in(bd.aligned_ptr, TA::Range{4}); + Inner src = + TA::detail::make_arena_tensor_in(bs.aligned_ptr, TA::Range{4}); + TA::fill(dst, 5.0); + TA::fill(src, 2.0); + TA::subt_to(dst, src); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 3.0); +} + +BOOST_AUTO_TEST_CASE(mult_to_does_elementwise) { + CellBuf bd(4), bs(4); + Inner dst = + TA::detail::make_arena_tensor_in(bd.aligned_ptr, TA::Range{4}); + Inner src = + TA::detail::make_arena_tensor_in(bs.aligned_ptr, TA::Range{4}); + TA::fill(dst, 4.0); + TA::fill(src, 0.5); + TA::mult_to(dst, src); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 2.0); +} + +BOOST_AUTO_TEST_CASE(axpy_to_scales_and_adds) { + CellBuf bd(4), bs(4); + Inner dst = + 
TA::detail::make_arena_tensor_in(bd.aligned_ptr, TA::Range{4}); + Inner src = + TA::detail::make_arena_tensor_in(bs.aligned_ptr, TA::Range{4}); + TA::fill(dst, 1.0); + TA::fill(src, 2.0); + // axpy_to(y, x, alpha): y += alpha * x (in-place, BLAS-like AXPY). + TA::axpy_to(dst, src, 3.0); + for (std::size_t i = 0; i < 4; ++i) BOOST_CHECK_EQUAL(dst.data()[i], 7.0); +} + +BOOST_AUTO_TEST_CASE(squared_norm_sums_squares) { + CellBuf buf(3); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{3}); + x.data()[0] = 1.0; + x.data()[1] = 2.0; + x.data()[2] = 2.0; + BOOST_CHECK_EQUAL(TA::squared_norm(x), 9.0); +} + +BOOST_AUTO_TEST_CASE(in_place_cpos_no_op_on_null) { + Inner null; + TA::zero(null); + TA::fill(null, 1.0); + TA::scale_to(null, 2.0); + TA::add_to(null, null); + BOOST_CHECK_EQUAL(TA::squared_norm(null), 0.0); +} + +BOOST_AUTO_TEST_CASE(materialize_returns_independent_standalone) { + CellBuf buf(4); + Inner x = + TA::detail::make_arena_tensor_in(buf.aligned_ptr, TA::Range{4}); + for (std::size_t i = 0; i < 4; ++i) x.data()[i] = double(i); + auto standalone = TA::materialize>(x); + BOOST_REQUIRE_EQUAL(standalone.range().volume(), 4u); + for (std::size_t i = 0; i < 4; ++i) + BOOST_CHECK_EQUAL(standalone.data()[i], double(i)); + standalone.data()[0] = 99.0; + BOOST_CHECK_EQUAL(x.data()[0], 0.0); +} + +BOOST_AUTO_TEST_CASE(materialize_null_yields_empty_standalone) { + Inner null; + auto standalone = TA::materialize>(null); + BOOST_CHECK(standalone.empty()); +} + +BOOST_AUTO_TEST_CASE(is_arena_tensor_v_predicate) { + static_assert(TA::is_arena_tensor_v); + static_assert(!TA::is_arena_tensor_v>); + static_assert(!TA::is_arena_tensor_v); + BOOST_CHECK(true); +} + +BOOST_AUTO_TEST_CASE(is_tensor_view_v_predicate) { + // ArenaTensor is a view that lacks value-returning member arithmetic -- + // it cannot allocate on its own. `is_tensor_view_v` is the predicate that + // opts such types out of value-returning operator dispatch. 
+ static_assert(TA::is_tensor_view_v); + // btas::TensorView is also a view without member arithmetic. + static_assert(TA::is_tensor_view_v>); + // TA::TensorMap (TensorInterface) is non-owning, but DOES provide + // value-returning member arithmetic (it materializes a fresh tensor), so + // it is intentionally NOT in `is_tensor_view`. + static_assert(!TA::is_tensor_view_v>); + static_assert(!TA::is_tensor_view_v>); + // Value-semantic tensors and scalars are not views. + static_assert(!TA::is_tensor_view_v>); + static_assert(!TA::is_tensor_view_v>); + static_assert(!TA::is_tensor_view_v); + // Layering: is_arena_tensor_v implies is_tensor_view_v. + static_assert(!TA::is_arena_tensor_v>); + static_assert(!TA::is_arena_tensor_v>); + BOOST_CHECK(true); +} + +BOOST_AUTO_TEST_CASE(gemm_inner_matrix_product) { + // C[3,5] += A[3,4] * B[4,5]; A is 1..12 row-major, B is 0.0,0.5,...,9.5. + CellBuf bl(12), br(20), bc(15); + Inner left = + TA::detail::make_arena_tensor_in(bl.aligned_ptr, TA::Range{3, 4}); + Inner right = + TA::detail::make_arena_tensor_in(br.aligned_ptr, TA::Range{4, 5}); + Inner result = + TA::detail::make_arena_tensor_in(bc.aligned_ptr, TA::Range{3, 5}); + for (int i = 0; i < 12; ++i) left.data()[i] = double(i + 1); + for (int i = 0; i < 20; ++i) right.data()[i] = 0.5 * double(i); + TA::zero(result); + + TA::math::GemmHelper helper(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::gemm(result, left, right, 1.0, helper); + + // Row-major reference: ref[i,k] = sum_j A[i,j] * B[j,k]. + double ref[15] = {0}; + for (int i = 0; i < 3; ++i) + for (int k = 0; k < 5; ++k) + for (int j = 0; j < 4; ++j) + ref[i * 5 + k] += left.data()[i * 4 + j] * right.data()[j * 5 + k]; + for (int i = 0; i < 15; ++i) + BOOST_CHECK_CLOSE(result.data()[i], ref[i], 1e-12); +} + +BOOST_AUTO_TEST_CASE(gemm_inner_accumulates_into_result) { + // C starts at known nonzero, gemm accumulates (beta=1). 
+ CellBuf bl(4), br(4), bc(4); + Inner left = + TA::detail::make_arena_tensor_in(bl.aligned_ptr, TA::Range{2, 2}); + Inner right = + TA::detail::make_arena_tensor_in(br.aligned_ptr, TA::Range{2, 2}); + Inner result = + TA::detail::make_arena_tensor_in(bc.aligned_ptr, TA::Range{2, 2}); + TA::fill(left, 1.0); + TA::fill(right, 2.0); + TA::fill(result, 10.0); // preload + + TA::math::GemmHelper helper(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::gemm(result, left, right, 1.0, helper); + // Each result entry: 10 (preload) + 2 (sum_j 1*2 over j=0..1) = 14. + for (int i = 0; i < 4; ++i) BOOST_CHECK_CLOSE(result.data()[i], 14.0, 1e-12); +} + +BOOST_AUTO_TEST_CASE(gemm_inner_skips_when_operand_null) { + // Null operands -> result unchanged (no-op). + CellBuf bc(4); + Inner result = + TA::detail::make_arena_tensor_in(bc.aligned_ptr, TA::Range{2, 2}); + TA::fill(result, 7.0); + Inner null_inner; + TA::math::GemmHelper helper(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::gemm(result, null_inner, null_inner, 1.0, helper); + for (int i = 0; i < 4; ++i) BOOST_CHECK_CLOSE(result.data()[i], 7.0, 1e-12); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/arena_tensor_kernels.cpp b/tests/arena_tensor_kernels.cpp new file mode 100644 index 0000000000..fea3046aa8 --- /dev/null +++ b/tests/arena_tensor_kernels.cpp @@ -0,0 +1,633 @@ +/// Tests for the arena-backed factory that builds an outer tile of +/// `ArenaTensor` cells: SIMD-aligned data, null cells for zero-volume +/// shapes, monotonic slab layout, slab survives factory scope. 
+ +#include "TiledArray/tensor/arena_kernels.h" + +#include "TiledArray/tensor.h" +#include "TiledArray/tensor/arena_einsum.h" +#include "tiledarray.h" +#include "unit_test_config.h" + +#include + +#include +#include +#include + +namespace TA = TiledArray; +using Inner = TA::ArenaTensor; +using Outer = TA::Tensor; + +BOOST_AUTO_TEST_SUITE(arena_tensor_kernels_suite, TA_UT_LABEL_SERIAL) + +BOOST_AUTO_TEST_CASE(builds_outer_with_uniform_inners) { + TA::Range outer_r{4}; + auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{8}; }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + BOOST_REQUIRE_EQUAL(outer.range().volume(), 4u); + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& inner = outer.data()[ord]; + BOOST_CHECK(bool(inner)); + BOOST_CHECK_EQUAL(inner.size(), 8u); + auto addr = reinterpret_cast(inner.data()); + BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + } +} + +BOOST_AUTO_TEST_CASE(zero_volume_shapes_yield_null_inners) { + TA::Range outer_r{4}; + auto shape_fn = [](std::size_t ord) { + return ord % 2 == 0 ? TA::Range{4} : TA::Range(); + }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& inner = outer.data()[ord]; + if (ord % 2 == 0) { + BOOST_CHECK(bool(inner)); + BOOST_CHECK_EQUAL(inner.size(), 4u); + } else { + BOOST_CHECK(!inner); + } + } +} + +BOOST_AUTO_TEST_CASE(non_null_cells_share_one_monotonic_slab) { + TA::Range outer_r{6}; + auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{6}; }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + const double* prev_end = nullptr; + for (std::size_t ord = 0; ord < 6; ++ord) { + Inner& inner = outer.data()[ord]; + const double* begin = inner.data(); + const double* end = begin + inner.size(); + if (prev_end != nullptr) { + BOOST_CHECK(begin >= prev_end); + // Gap bounded by one cell stride (cache-line-floor or SIMD-driven). 
+ const std::size_t gap = + static_cast(begin - prev_end) * sizeof(double); + BOOST_CHECK_LE(gap, + TA::detail::kArenaCachelineAlign + Inner::cell_size(0)); + } + prev_end = end; + } +} + +BOOST_AUTO_TEST_CASE(outer_outlives_factory_scope) { + Outer outer; + { + TA::Range outer_r{3}; + auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{4}; }; + outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + } + for (std::size_t ord = 0; ord < 3; ++ord) { + Inner& inner = outer.data()[ord]; + TA::fill(inner, double(ord + 1)); + } + for (std::size_t ord = 0; ord < 3; ++ord) { + Inner& inner = outer.data()[ord]; + for (std::size_t i = 0; i < inner.size(); ++i) + BOOST_CHECK_EQUAL(inner.data()[i], double(ord + 1)); + } +} + +BOOST_AUTO_TEST_CASE(jagged_inner_shapes_round_trip) { + TA::Range outer_r{4}; + std::vector sizes = {3, 5, 0, 7}; + auto shape_fn = [&](std::size_t ord) { + return sizes[ord] == 0 ? TA::Range() : TA::Range{sizes[ord]}; + }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& inner = outer.data()[ord]; + if (sizes[ord] == 0) { + BOOST_CHECK(!inner); + } else { + BOOST_REQUIRE(bool(inner)); + BOOST_CHECK_EQUAL(inner.size(), static_cast(sizes[ord])); + auto addr = reinterpret_cast(inner.data()); + BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + } + } +} + +BOOST_AUTO_TEST_CASE(empty_outer_range_yields_no_slab) { + TA::Range outer_r{0}; + auto shape_fn = [](std::size_t /*ord*/) { return TA::Range{4}; }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + BOOST_CHECK_EQUAL(outer.range().volume(), 0u); +} + +BOOST_AUTO_TEST_CASE(all_null_outer_works) { + TA::Range outer_r{5}; + auto shape_fn = [](std::size_t /*ord*/) { return TA::Range(); }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + for (std::size_t ord = 0; ord < 5; ++ord) BOOST_CHECK(!outer.data()[ord]); +} + +namespace { + +/// Build an outer with uniform inners filled by an 
ordinal-dependent rule. +Outer make_outer(std::size_t n_outer, std::size_t n_inner, double base) { + TA::Range outer_r{static_cast(n_outer)}; + auto shape_fn = [n_inner](std::size_t /*ord*/) { + return TA::Range{static_cast(n_inner)}; + }; + Outer outer = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + for (std::size_t ord = 0; ord < n_outer; ++ord) { + Inner& inner = outer.data()[ord]; + for (std::size_t i = 0; i < inner.size(); ++i) + inner.data()[i] = base + ord * 100.0 + i; + } + return outer; +} + +bool outers_equal(const Outer& a, const Outer& b) { + if (a.range().volume() != b.range().volume()) return false; + for (std::size_t ord = 0; ord < a.range().volume(); ++ord) { + const Inner& ai = a.data()[ord]; + const Inner& bi = b.data()[ord]; + if (bool(ai) != bool(bi)) return false; + if (!ai) continue; + if (ai.size() != bi.size()) return false; + for (std::size_t i = 0; i < ai.size(); ++i) + if (ai.data()[i] != bi.data()[i]) return false; + } + return true; +} + +} // namespace + +BOOST_AUTO_TEST_CASE(arena_tensor_is_a_tensor_but_a_view) { + // ArenaTensor is registered as is_tensor_helper / is_contiguous_tensor so + // kernel paths treat it like Tensor; the `is_tensor_view` trait + // opts it out of value-returning member-call paths (which require + // allocation a view cannot do). + static_assert(TA::detail::is_tensor_helper::value); + static_assert(TA::detail::is_contiguous_tensor::value); + static_assert(TA::detail::is_tensor::value); + static_assert(TA::is_tensor_view_v); + static_assert(TA::is_arena_tensor_v); + // ta_ops_match_tensor (value-returning ops gate) is now false for views. + static_assert(!TA::detail::ta_ops_match_tensor_v); + // ta_ops_match_tensor_inplace (in-place ops gate) is true. 
+ static_assert(TA::detail::ta_ops_match_tensor_inplace_v); + BOOST_CHECK(true); +} + +BOOST_AUTO_TEST_CASE(trivial_clone_inner_round_trip) { + Outer src = make_outer(4, 5, 1.0); + Outer copy = src.clone(); + BOOST_CHECK(outers_equal(copy, src)); + // Independent slab: mutating copy doesn't affect src. + copy.data()[0].data()[0] = -1.0; + BOOST_CHECK_EQUAL(src.data()[0].data()[0], 1.0); +} + +BOOST_AUTO_TEST_CASE(trivial_scale_inner_multiplies) { + Outer src = make_outer(3, 4, 1.0); + Outer scaled = src.scale(2.5); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& sinner = src.data()[ord]; + const Inner& dinner = scaled.data()[ord]; + BOOST_REQUIRE_EQUAL(dinner.size(), sinner.size()); + for (std::size_t i = 0; i < sinner.size(); ++i) + BOOST_CHECK_EQUAL(dinner.data()[i], sinner.data()[i] * 2.5); + } +} + +BOOST_AUTO_TEST_CASE(trivial_add_inner_accumulates) { + Outer L = make_outer(3, 4, 1.0); + Outer R = make_outer(3, 4, 0.5); + Outer sum = L.add(R); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& l = L.data()[ord]; + const Inner& r = R.data()[ord]; + const Inner& d = sum.data()[ord]; + for (std::size_t i = 0; i < l.size(); ++i) + BOOST_CHECK_EQUAL(d.data()[i], l.data()[i] + r.data()[i]); + } +} + +BOOST_AUTO_TEST_CASE(trivial_subt_inner_subtracts) { + Outer L = make_outer(3, 4, 5.0); + Outer R = make_outer(3, 4, 1.0); + Outer diff = L.subt(R); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& l = L.data()[ord]; + const Inner& r = R.data()[ord]; + const Inner& d = diff.data()[ord]; + for (std::size_t i = 0; i < l.size(); ++i) + BOOST_CHECK_EQUAL(d.data()[i], l.data()[i] - r.data()[i]); + } +} + +BOOST_AUTO_TEST_CASE(trivial_mult_inner_elementwise) { + Outer L = make_outer(3, 4, 2.0); + Outer R = make_outer(3, 4, 0.5); + Outer prod = L.mult(R); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& l = L.data()[ord]; + const Inner& r = R.data()[ord]; + const Inner& d = prod.data()[ord]; + for (std::size_t i = 0; i < l.size(); 
++i) + BOOST_CHECK_EQUAL(d.data()[i], l.data()[i] * r.data()[i]); + } +} + +BOOST_AUTO_TEST_CASE(contraction_arena_plan_reserve_and_construct_inner) { + // Verify ContractionArenaPlan's inner-tensor dispatch builds the right + // outer/inner shapes and SIMD-aligns each non-null inner cell. + TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + auto left = TA::detail::arena_outer_init(TA::Range{2, 3}, 1, + [](std::size_t /*ord*/) { + return TA::Range{4, 5}; + }); + auto right = TA::detail::arena_outer_init(TA::Range{3, 4}, 1, + [](std::size_t /*ord*/) { + return TA::Range{5, 6}; + }); + TA::detail::ArenaInnerShapePlan inner_plan{ + TA::detail::ArenaInnerShapeKind::gemm_result_range, + std::make_optional(inner_gh)}; + TA::detail::ContractionArenaPlan plan(inner_plan); + Outer result = plan.reserve_and_construct(left, right, outer_gh); + // Outer result: 2x4 = 8 cells; each inner: 4x6 = 24 elements. + BOOST_REQUIRE_EQUAL(result.range().volume(), 8u); + for (std::size_t ord = 0; ord < 8; ++ord) { + const Inner& inner = result.data()[ord]; + BOOST_REQUIRE(bool(inner)); + BOOST_CHECK_EQUAL(inner.size(), 24u); + auto addr = reinterpret_cast(inner.data()); + BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + } +} + +BOOST_AUTO_TEST_CASE(outer_gemm_with_arena_tensor_contraction) { + // End-to-end: arena-allocate result via the plan, then run TA::Tensor's + // outer gemm with a custom elem_muladd_op that calls the free gemm CPO + // for ArenaTensor inners. Verifies the full chain reserve_and_construct + // -> outer iteration -> inner BLAS gemm. 
+ TA::math::GemmHelper outer_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + TA::math::GemmHelper inner_gh(TA::math::blas::NoTranspose, + TA::math::blas::NoTranspose, 2, 2, 2); + // A[2,3] outer of <4,5> inners (each 1.0); B[3,4] outer of <5,6> inners + // (each 2.0). C[2,4] outer of <4,6> inners; each entry = + // sum over outer k in [0,3) of sum over inner k in [0,5) of 1.0*2.0 + // = 3 * 5 * 2.0 = 30.0 + auto left = TA::detail::arena_outer_init(TA::Range{2, 3}, 1, + [](std::size_t /*ord*/) { + return TA::Range{4, 5}; + }); + auto right = TA::detail::arena_outer_init(TA::Range{3, 4}, 1, + [](std::size_t /*ord*/) { + return TA::Range{5, 6}; + }); + for (std::size_t i = 0; i < left.range().volume(); ++i) + TA::fill(left.data()[i], 1.0); + for (std::size_t i = 0; i < right.range().volume(); ++i) + TA::fill(right.data()[i], 2.0); + + TA::detail::ArenaInnerShapePlan inner_plan{ + TA::detail::ArenaInnerShapeKind::gemm_result_range, + std::make_optional(inner_gh)}; + TA::detail::ContractionArenaPlan plan(inner_plan); + Outer result = plan.reserve_and_construct(left, right, outer_gh); + + auto elem_muladd = [&inner_gh](Inner& r, const Inner& l, const Inner& rr) { + TA::gemm(r, l, rr, 1.0, inner_gh); + }; + result.gemm(left, right, outer_gh, elem_muladd); + + for (std::size_t ord = 0; ord < result.range().volume(); ++ord) { + const Inner& inner = result.data()[ord]; + BOOST_REQUIRE(bool(inner)); + BOOST_REQUIRE_EQUAL(inner.size(), 24u); + for (std::size_t e = 0; e < 24; ++e) + BOOST_CHECK_CLOSE(inner.data()[e], 30.0, 1e-12); + } +} + +BOOST_AUTO_TEST_CASE(trivial_ops_preserve_null_cells) { + // Outer with mixed null and non-null inners; trivial ops should propagate + // null cells through to the result. + TA::Range outer_r{4}; + auto shape_fn = [](std::size_t ord) { + return ord % 2 == 0 ? 
TA::Range{4} : TA::Range(); + }; + Outer src = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& inner = src.data()[ord]; + if (inner) { + for (std::size_t i = 0; i < inner.size(); ++i) inner.data()[i] = 1.0; + } + } + Outer scaled = src.scale(3.0); + for (std::size_t ord = 0; ord < 4; ++ord) { + const Inner& d = scaled.data()[ord]; + if (ord % 2 == 0) { + BOOST_REQUIRE(bool(d)); + for (std::size_t i = 0; i < d.size(); ++i) + BOOST_CHECK_EQUAL(d.data()[i], 3.0); + } else { + BOOST_CHECK(!d); + } + } +} + +// Outer-tile serialize round-trip: exercises the arena-aware path in +// TA::Tensor::serialize directly via an in-memory archive. The slab +// gets rebuilt on load. +BOOST_AUTO_TEST_CASE(outer_tile_serialize_round_trip_arena_tensor) { + // Build an outer with jagged inner shapes including one null cell. + Outer src = + TA::detail::arena_outer_init(TA::Range{4}, 1, [](std::size_t ord) { + if (ord == 2) return TA::Range(); // null cell + return TA::Range{static_cast(3 + ord)}; + }); + // Fill non-null cells with ord-dependent values. + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& cell = src.data()[ord]; + if (cell) { + for (std::size_t i = 0; i < cell.size(); ++i) + cell.data()[i] = double(ord * 100 + i); + } + } + + const std::size_t buf_size = 1 << 16; + std::vector buf(buf_size); + madness::archive::BufferOutputArchive oar(buf.data(), buf_size); + BOOST_REQUIRE_NO_THROW(oar & src); + const std::size_t nbyte = oar.size(); + oar.close(); + + Outer dst; + madness::archive::BufferInputArchive iar(buf.data(), nbyte); + BOOST_REQUIRE_NO_THROW(iar & dst); + iar.close(); + + // Verify outer shape, null/non-null flags, inner shapes, element values. 
+ BOOST_REQUIRE_EQUAL(dst.range().volume(), src.range().volume()); + for (std::size_t ord = 0; ord < src.range().volume(); ++ord) { + const Inner& s = src.data()[ord]; + const Inner& d = dst.data()[ord]; + BOOST_REQUIRE_EQUAL(bool(s), bool(d)); + if (!s) continue; + BOOST_REQUIRE_EQUAL(s.size(), d.size()); + for (std::size_t i = 0; i < s.size(); ++i) + BOOST_CHECK_EQUAL(d.data()[i], s.data()[i]); + // The loaded cell's data pointer is SIMD-aligned via + // arena_outer_init. + auto addr = reinterpret_cast(d.data()); + BOOST_CHECK_EQUAL(addr % TA::kInnerSimdAlign, 0u); + } +} + +// DistArray-level test: forces `TA::DistArray>` to +// instantiate, exercising arena-aware serialization at the outer-tile +// boundary. Serial-only (no @distributed). +BOOST_AUTO_TEST_CASE(distarray_arena_tensor_construct_and_init_tiles) { + using Array = TA::DistArray; + auto& world = TA::get_default_world(); + TA::TiledRange tr{TA::TiledRange1{0, 2, 4}}; + Array A(world, tr); + A.init_tiles([](const TA::Range& tile_range) { + return TA::detail::arena_outer_init( + tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{3}; }); + }); + world.gop.fence(); + BOOST_CHECK_EQUAL(A.trange().tiles_range().volume(), 2u); + if (A.is_local(0)) { + Outer tile = A.find(0).get(); + BOOST_CHECK_EQUAL(tile.range().volume(), 2u); + for (std::size_t i = 0; i < tile.range().volume(); ++i) { + const Inner& cell = tile.data()[i]; + BOOST_REQUIRE(bool(cell)); + BOOST_CHECK_EQUAL(cell.size(), 3u); + } + } +} + +// Mixed scalar/ArenaTensor outer Hadamard: each scalar-side outer cell +// multiplies the corresponding ArenaTensor-side inner element-wise. +// Exercises Tensor::mult(Tensor) and the symmetric +// Tensor::mult(Tensor). +BOOST_AUTO_TEST_CASE(mixed_outer_mult_scalar_times_arena) { + using Scalars = TA::Tensor; + // 3 outer cells, each inner of size 4, base value 1.0 + ord*100 + i. 
+ Outer A = make_outer(3, 4, 1.0); + Scalars S(TA::Range{3}); + S.at_ordinal(0) = 2.0; + S.at_ordinal(1) = -1.5; + S.at_ordinal(2) = 0.25; + + // Tensor * Tensor + Outer prod_as = A.mult(S); + BOOST_REQUIRE_EQUAL(prod_as.range().volume(), 3u); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& a = A.data()[ord]; + const Inner& d = prod_as.data()[ord]; + BOOST_REQUIRE(bool(d)); + BOOST_REQUIRE_EQUAL(d.size(), a.size()); + // Result must be independent of the source slab. + BOOST_CHECK_NE(d.data(), a.data()); + for (std::size_t i = 0; i < a.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], a.data()[i] * S.at_ordinal(ord), 1e-12); + } + + // Tensor * Tensor + Outer prod_sa = S.mult(A); + BOOST_REQUIRE_EQUAL(prod_sa.range().volume(), 3u); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& a = A.data()[ord]; + const Inner& d = prod_sa.data()[ord]; + BOOST_REQUIRE(bool(d)); + BOOST_REQUIRE_EQUAL(d.size(), a.size()); + for (std::size_t i = 0; i < a.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], S.at_ordinal(ord) * a.data()[i], 1e-12); + } +} + +// Mixed mult preserves null cells coming from the arena side. +BOOST_AUTO_TEST_CASE(mixed_outer_mult_preserves_null_cells) { + using Scalars = TA::Tensor; + TA::Range outer_r{4}; + auto shape_fn = [](std::size_t ord) { + return ord % 2 == 0 ? 
TA::Range{4} : TA::Range(); + }; + Outer A = TA::detail::arena_outer_init(outer_r, 1, shape_fn); + for (std::size_t ord = 0; ord < 4; ++ord) { + Inner& inner = A.data()[ord]; + if (inner) + for (std::size_t i = 0; i < inner.size(); ++i) inner.data()[i] = 1.0; + } + Scalars S(TA::Range{4}); + S.at_ordinal(0) = 3.0; + S.at_ordinal(1) = 7.0; + S.at_ordinal(2) = -2.0; + S.at_ordinal(3) = 11.0; + + Outer prod = A.mult(S); + for (std::size_t ord = 0; ord < 4; ++ord) { + const Inner& d = prod.data()[ord]; + if (ord % 2 == 0) { + BOOST_REQUIRE(bool(d)); + for (std::size_t i = 0; i < d.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], 1.0 * S.at_ordinal(ord), 1e-12); + } else { + BOOST_CHECK(!d); + } + } +} + +// Mixed scalar/ArenaTensor add/subt: scalar broadcast across each inner. +BOOST_AUTO_TEST_CASE(mixed_outer_add_subt_scalar_and_arena) { + using Scalars = TA::Tensor; + Outer A = make_outer(3, 4, 1.0); + Scalars S(TA::Range{3}); + S.at_ordinal(0) = 10.0; + S.at_ordinal(1) = -2.0; + S.at_ordinal(2) = 0.5; + + // ToT + scalar → broadcast scalar across each inner element. + Outer sum_as = A.add(S); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& a = A.data()[ord]; + const Inner& d = sum_as.data()[ord]; + BOOST_REQUIRE(bool(d)); + for (std::size_t i = 0; i < a.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], a.data()[i] + S.at_ordinal(ord), 1e-12); + } + // scalar + ToT → symmetric. + Outer sum_sa = S.add(A); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& a = A.data()[ord]; + const Inner& d = sum_sa.data()[ord]; + BOOST_REQUIRE(bool(d)); + for (std::size_t i = 0; i < a.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], S.at_ordinal(ord) + a.data()[i], 1e-12); + } + // ToT - scalar → subtract per-cell scalar. 
+ Outer diff_as = A.subt(S); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& a = A.data()[ord]; + const Inner& d = diff_as.data()[ord]; + BOOST_REQUIRE(bool(d)); + for (std::size_t i = 0; i < a.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], a.data()[i] - S.at_ordinal(ord), 1e-12); + } + // scalar - ToT. + Outer diff_sa = S.subt(A); + for (std::size_t ord = 0; ord < 3; ++ord) { + const Inner& a = A.data()[ord]; + const Inner& d = diff_sa.data()[ord]; + BOOST_REQUIRE(bool(d)); + for (std::size_t i = 0; i < a.size(); ++i) + BOOST_CHECK_CLOSE(d.data()[i], S.at_ordinal(ord) - a.data()[i], 1e-12); + } +} + +// `Tensor` should support the same reductions as a flat tensor +// (sum / product / squared_norm / min / max), routed through TA's ToT +// reduce path via the `is_tensor_of_tensor_helper` extension. +// Sanity-check the trait flip: +// - is_arena_tensor_v> must be true +// - is_tensor_of_tensor_v> must be true (was false) +// - is_tensor_v> must be false (was true) +static_assert(TA::is_arena_tensor_v); +static_assert(TA::detail::is_tensor_of_tensor_v); +static_assert(!TA::detail::is_tensor_v); +// And view-aware in-place ops must work for Tensor. +// Confirm prerequisite traits hold: +static_assert(TA::is_tensor_view_v, "ArenaTensor must be a tensor view"); +static_assert(TA::is_tensor_view_v, + "Outer's value_type (ArenaTensor) must be a tensor view"); + +// Spot-check that the legacy in-place ops which use `is_tensor` +// SFINAE *do not* match for `Tensor` after the trait flip. +// If they did, instantiating them would fail (no operator-= on +// ArenaTensor). Probe via `has_member_function_subt_to_anyreturn_v`. + +// Smoke test: in-place ops on Tensor compile and execute. 
+BOOST_AUTO_TEST_CASE(tot_inplace_ops_smoketest) {
+  // NOTE(review): template arguments (e.g. on arena_outer_init) are missing in
+  // this copy of the patch; confirm against upstream before applying.
+  // Two outer tensors of 2 cells x 3 elements: `a` filled with 1.0, `b` with
+  // 2.0. Each in-place op below is followed by a check of every element.
+  Outer a = TA::detail::arena_outer_init(
+      TA::Range{2}, 1, [](std::size_t) { return TA::Range{3}; });
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i) a.data()[ord].data()[i] = 1.0;
+  Outer b = TA::detail::arena_outer_init(
+      TA::Range{2}, 1, [](std::size_t) { return TA::Range{3}; });
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i) b.data()[ord].data()[i] = 2.0;
+  a.add_to(b);  // expect 3.0 elements
+  // Boost.Test interprets the last BOOST_CHECK_CLOSE argument as a percentage.
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 3.0, 1e-12);
+  a.subt_to(b);  // back to 1.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 1.0, 1e-12);
+  a.mult_to(b);  // 2.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 2.0, 1e-12);
+  a.scale_to(0.5);  // 1.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], 1.0, 1e-12);
+  a.neg_to();  // -1.0
+  for (std::size_t ord = 0; ord < 2; ++ord)
+    for (std::size_t i = 0; i < 3; ++i)
+      BOOST_CHECK_CLOSE(a.data()[ord].data()[i], -1.0, 1e-12);
+}
+
+// Reductions over a 3-cell x 4-element outer tensor are compared with values
+// accumulated by hand while the cells are being filled.
+BOOST_AUTO_TEST_CASE(tot_reductions_match_flat_aggregate) {
+  using Inner = TA::ArenaTensor;
+  using Outer = TA::Tensor;
+  Outer a = TA::detail::arena_outer_init(
+      TA::Range{3}, 1, [](std::size_t /*ord*/) { return TA::Range{4}; });
+  double expected_sum = 0.0;
+  double expected_product = 1.0;
+  double expected_sq_norm = 0.0;
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    Inner& inner = a.data()[ord];
+    for (std::size_t i = 0; i < inner.size(); ++i) {
+      const double v = 1.0 + ord * 10.0 + i;  // deterministic, all positive
+      inner.data()[i] = v;
+      expected_sum += v;
+      expected_product *= v;
+      expected_sq_norm += v * v;
+    }
+  }
+  BOOST_CHECK_CLOSE(a.sum(), expected_sum, 1e-12);
+  BOOST_CHECK_CLOSE(a.product(), expected_product, 1e-12);
+  BOOST_CHECK_CLOSE(a.squared_norm(), expected_sq_norm, 1e-12);
+  // norm() is expected to be the square root of squared_norm().
+  BOOST_CHECK_CLOSE(a.norm(), std::sqrt(expected_sq_norm), 1e-12);
+}
+
+// axpy_to on Tensor: verifies axpy semantics
+// (factor scales only the added operand, not the existing result) —
+// distinct from add_to(right, factor) which is `(result + right) * factor`.
+BOOST_AUTO_TEST_CASE(tot_axpy_to_accumulates_scaled_operand) {
+  Outer result = make_outer(3, 4, 10.0);
+  // Snapshot the accumulator before the call so the expected value can be
+  // formed as initial + arg * factor.
+  std::vector> initial(3, std::vector(4));
+  for (std::size_t ord = 0; ord < 3; ++ord)
+    for (std::size_t i = 0; i < 4; ++i)
+      initial[ord][i] = result.data()[ord].data()[i];
+  Outer arg = make_outer(3, 4, 1.0);
+  const double factor = 0.5;
+  using TiledArray::axpy_to;
+  axpy_to(result, arg, factor);
+  for (std::size_t ord = 0; ord < 3; ++ord) {
+    const Inner& a = arg.data()[ord];
+    const Inner& d = result.data()[ord];
+    for (std::size_t i = 0; i < a.size(); ++i)
+      BOOST_CHECK_CLOSE(d.data()[i], initial[ord][i] + a.data()[i] * factor,
+                        1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/arena_tot_trivial.cpp b/tests/arena_tot_trivial.cpp
new file mode 100644
index 0000000000..627bd5a7cc
--- /dev/null
+++ b/tests/arena_tot_trivial.cpp
@@ -0,0 +1,144 @@
+/// Arena-aware ToT trivial-op end-to-end tests (add, subt, mult, scale, clone).
+
+#include "TiledArray/tensor.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+
+#include
+#include
+#include
+
+// NOTE(review): the bare `#include` lines above and the argument-less
+// `TA::Tensor` / `static_cast` below have lost their angle-bracket text in this
+// copy of the patch; restore from upstream before applying.
+namespace TA = TiledArray;
+using inner_t = TA::Tensor;
+using outer_t = TA::Tensor;
+
+namespace {
+
+/// Builds an outer tensor of `N_outer` cells, each holding an inner tensor of
+/// `n_inner` elements whose value at position `i` of cell `ord` is
+/// `base + ord * 100.0 + i`.
+outer_t make_tot(std::size_t N_outer, std::size_t n_inner, double base = 1.0) {
+  outer_t outer(TA::Range{static_cast(N_outer)}, 1);
+  for (std::size_t ord = 0; ord < N_outer; ++ord) {
+    inner_t inner(TA::Range{static_cast(n_inner)});
+    for (std::size_t i = 0; i < n_inner; ++i)
+      inner.at_ordinal(i) = base + ord * 100.0 + i;
+    *(outer.data() + ord) = std::move(inner);
+  }
+  return outer;
+}
+
+/// Exact (operator!=) comparison of two outer tensors: same outer volume, same
+/// per-cell inner volume, and identical element values. Range bounds are not
+/// compared, only volumes.
+bool tot_equal(const outer_t& a, const outer_t& b) {
+  if (a.range().volume() != b.range().volume()) return false;
+  for (std::size_t ord = 0; ord < a.range().volume(); ++ord) {
+    const inner_t& ai = *(a.data() + ord);
+    const inner_t& bi = *(b.data() + ord);
+    if (ai.range().volume() != bi.range().volume()) return false;
+    for (std::size_t i = 0; i < ai.range().volume(); ++i)
+      if (ai.at_ordinal(i) != bi.at_ordinal(i)) return false;
+  }
+  return true;
+}
+
+/// All inner cells point into one contiguous slab (monotonic with bounded gap).
+/// This is a heuristic: cells are visited in ordinal order, empty cells are
+/// skipped, and each non-empty cell must start at or after the end of the
+/// previous one with a gap of at most 1024 elements.
+bool inners_share_one_slab(const outer_t& tot) {
+  if (tot.range().volume() == 0) return true;
+  const double* prev_end = nullptr;
+  for (std::size_t ord = 0; ord < tot.range().volume(); ++ord) {
+    const inner_t& cell = *(tot.data() + ord);
+    if (cell.range().volume() == 0) continue;
+    const double* cell_begin = cell.data();
+    const double* cell_end = cell_begin + cell.range().volume();
+    // Overlap or backwards placement means the cells are not laid out in order.
+    if (prev_end != nullptr && cell_begin < prev_end) return false;
+    // A large forward jump suggests a separate allocation.
+    if (prev_end != nullptr &&
+        static_cast(cell_begin - prev_end) > 1024)
+      return false;
+    prev_end = cell_end;
+  }
+  return true;
+}
+
+}
+
+BOOST_AUTO_TEST_SUITE(arena_tot_trivial_suite, TA_UT_LABEL_SERIAL)
+
+// Each *_bit_equal_and_one_slab case computes the operation through the Tensor
+// member function, rebuilds the expected result cell by cell with plain
+// element arithmetic, and then checks exact equality plus the slab heuristic.
+BOOST_AUTO_TEST_CASE(scale_bit_equal_and_one_slab) {
+  outer_t src = make_tot(6, 8, 1.0);
+  outer_t arena_result = src.scale(2.5);
+  outer_t baseline(src.range(), 1);
+  for (std::size_t ord = 0; ord < src.range().volume(); ++ord) {
+    inner_t inner((src.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (src.data() + ord)->at_ordinal(i) * 2.5;
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+// clone() must reproduce the source values and satisfy the slab heuristic.
+BOOST_AUTO_TEST_CASE(clone_bit_equal_and_one_slab) {
+  outer_t src = make_tot(6, 8, 3.0);
+  outer_t arena_result = src.clone();
+  BOOST_CHECK(tot_equal(arena_result, src));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(add_bit_equal_and_one_slab) {
+  outer_t L = make_tot(6, 8, 1.0);
+  outer_t R = make_tot(6, 8, 0.5);
+  outer_t arena_result = L.add(R);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (L.data() + ord)->at_ordinal(i) +
+                            (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(subt_bit_equal_and_one_slab) {
+  outer_t L = make_tot(6, 8, 5.0);
+  outer_t R = make_tot(6, 8, 1.0);
+  outer_t arena_result = L.subt(R);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (L.data() + ord)->at_ordinal(i) -
+                            (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+BOOST_AUTO_TEST_CASE(mult_elementwise_bit_equal_and_one_slab) {
+  outer_t L = make_tot(6, 8, 2.0);
+  outer_t R = make_tot(6, 8, 0.5);
+  outer_t arena_result = L.mult(R);
+  outer_t baseline(L.range(), 1);
+  for (std::size_t ord = 0; ord < L.range().volume(); ++ord) {
+    inner_t inner((L.data() + ord)->range());
+    for (std::size_t i = 0; i < inner.range().volume(); ++i)
+      inner.at_ordinal(i) = (L.data() + ord)->at_ordinal(i) *
+                            (R.data() + ord)->at_ordinal(i);
+    *(baseline.data() + ord) = std::move(inner);
+  }
+  BOOST_CHECK(tot_equal(arena_result, baseline));
+  BOOST_CHECK(inners_share_one_slab(arena_result));
+}
+
+// The result of scale() is read after the source tensor has gone out of
+// scope, so it must not depend on the source's storage.
+BOOST_AUTO_TEST_CASE(arena_outlives_source) {
+  outer_t arena_result;
+  {
+    outer_t src = make_tot(3, 4, 9.0);
+    arena_result = src.scale(2.0);
+  }
+  for (std::size_t ord = 0; ord < arena_result.range().volume(); ++ord)
+    for (std::size_t i = 0; i < (arena_result.data() + ord)->range().volume();
+         ++i)
+      BOOST_CHECK_EQUAL((arena_result.data() + ord)->at_ordinal(i),
+                        (9.0 + ord * 100.0 + i) * 2.0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/cases/CMakeLists.txt b/tests/cases/CMakeLists.txt
new file mode 100644
index 0000000000..8cc4721163
--- /dev/null
+++ b/tests/cases/CMakeLists.txt
@@ -0,0 +1,15 @@
+# hec_* + 4d_e per-cell case binaries (arena vs heap).
+
+# One executable is built per entry; each entry names a <case>.cpp in this
+# directory.
+set(_cases
+    case_hec_h
+    case_hec_e
+    case_hec_ec
+    case_hec_scale
+    case_4d_e
+)
+
+foreach(_case ${_cases})
+  add_ta_executable(${_case} "${_case}.cpp" "tiledarray")
+  # Lets each case find case_common.h next to its source file.
+  target_include_directories(${_case} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+  add_dependencies(test-cases-tiledarray ${_case})
+endforeach()
diff --git a/tests/cases/case_4d_e.cpp b/tests/cases/case_4d_e.cpp
new file mode 100644
index 0000000000..df1f47b6ea
--- /dev/null
+++ b/tests/cases/case_4d_e.cpp
@@ -0,0 +1,61 @@
+/// 4d_e: outer-4D x outer-3D with one Hadamard, one contracted, three free.
+
+#include "case_common.h"
+
+#include
+
+namespace c = cases;
+
+namespace {
+
+/// Deterministic truncated-exponential inner-size, mean ~10, cap 50.
+/// NOTE(review): the `static_cast` target types are missing in this copy of the
+/// patch; restore from upstream before applying.
+inline long a_size(long p, long q) {
+  // Mix (p, q) into an integer hash; the same pair always gives the same size.
+  unsigned long h =
+      (static_cast(p) * 73ULL +
+       static_cast(q) * 113ULL + 17ULL) * 2654435761ULL;
+  // Low 31 bits scaled into [0, 1).
+  double u = static_cast(h & 0x7FFFFFFFUL) /
+             static_cast(0x80000000UL);
+  // Inverse-CDF transform of u with scale 10, then clamp at 50.
+  double x = -10.0 * std::log(1.0 - u);
+  if (x > 50.0) x = 50.0;
+  // Conversion to long drops the fractional part, so a size of 0 is possible.
+  return static_cast(x);
+}
+
+}  // namespace
+
+/// Operands handed from the builder callbacks to the timed callback.
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  // Outer extents: lhs is I x I x M x K, rhs is I x I x M.
+  constexpr int I = 20;
+  constexpr int M = 50;
+  constexpr int K = 100;
+
+  // Inner-range shapes; both depend on only two of the outer indices.
+  auto sl = [](long q, long p, long /*m*/, long /*k*/) {
+    return TiledArray::Range{a_size(p, q)};
+  };
+  auto sr = [](long r, long q, long /*m*/) {
+    return TiledArray::Range{a_size(q, r)};
+  };
+
+  // Callback order: heap-input builder, slab-input builder, timed expression.
+  return c::run_case_main_split(
+      argc, argv, "4d_e",
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_4d_jagged(w, I, I, M, K, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged(w, I, I, M, 100.0, sr);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_4d_jagged_slab(w, I, I, M, K, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, I, I, M, 100.0, sr);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("q,p,m,k;s"),
+                                  ops.rhs("r,q,m;t"),
+                                  "p,r,q,k;s,t");
+      });
+}
diff --git a/tests/cases/case_common.h
b/tests/cases/case_common.h
new file mode 100644
index 0000000000..b097eb56cd
--- /dev/null
+++ b/tests/cases/case_common.h
@@ -0,0 +1,528 @@
+/// Shared bench helpers for arena-vs-heap case binaries.
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// NOTE(review): the `#include` targets above and every template argument below
+// (`TA::Tensor`, `std::vector`, `static_cast`) are missing in this copy of the
+// patch; restore from upstream before applying.
+namespace cases {
+
+namespace TA = ::TiledArray;
+
+using inner_t = TA::Tensor;
+using tile_t = TA::Tensor;
+using ToT = TA::DistArray;
+using Plain = TA::DistArray;
+
+/// Process-wide number of tiles per dimension used by tr1_dim(); starts at 7
+/// and is returned by reference so callers can overwrite it.
+inline int& g_tile_grid() {
+  static int v = 7;
+  return v;
+}
+
+/// Stores the h-dimension scale set by --h-scale.
+inline int& g_h_scale() {
+  static int v = 1;
+  return v;
+}
+
+/// Returns the tile boundaries that split [0, n) into at most `ntiles` tiles.
+/// Every tile but the last has n / ntiles elements; the last tile absorbs the
+/// remainder. With ntiles <= 1 or n <= 0 the result is the single pair
+/// {0, max(n, 0)}.
+inline std::vector tile_breaks(int n, int ntiles) {
+  if (ntiles <= 1 || n <= 0)
+    return {0, static_cast(std::max(n, 0))};
+  std::vector b;
+  b.reserve(ntiles + 1);
+  const int chunk = n / ntiles;
+  for (int t = 0; t < ntiles; ++t) {
+    b.push_back(static_cast(t * chunk));
+  }
+  b.push_back(static_cast(n));
+  // When n < ntiles, chunk is 0 and the boundaries repeat; drop consecutive
+  // duplicates so no zero-width tile is produced.
+  std::vector uniq;
+  for (auto x : b)
+    if (uniq.empty() || uniq.back() != x) uniq.push_back(x);
+  return uniq;
+}
+
+/// 1-D tiling of extent `n` using the current g_tile_grid() tile count.
+inline TA::TiledRange1 tr1_dim(int n) {
+  auto b = tile_breaks(n, g_tile_grid());
+  return TA::TiledRange1(b.begin(), b.end());
+}
+
+/// 3-D tiled range with extents a x b x c, each dimension tiled by tr1_dim().
+inline TA::TiledRange tr3(int a, int b, int c) {
+  return TA::TiledRange{tr1_dim(a), tr1_dim(b), tr1_dim(c)};
+}
+
+/// 4-D tiled range with extents a x b x c x d, each tiled by tr1_dim().
+inline TA::TiledRange tr4(int a, int b, int c, int d) {
+  return TA::TiledRange{tr1_dim(a), tr1_dim(b), tr1_dim(c), tr1_dim(d)};
+}
+
+/// Builds a 3-D slab-backed jagged ToT.
+/// `inner_fn(o0, o1, o2)` returns the inner range for an outer element. All
+/// inner cells of one tile share a single allocation, and cells whose range
+/// has zero volume are stored as default-constructed (empty) inners. Ends with
+/// a world fence before returning.
+/// NOTE(review): template parameter lists and `static_cast` / `shared_ptr` /
+/// `vector` arguments are missing in this copy of the patch; restore from
+/// upstream before applying.
+template
+ToT make_tot_3d_jagged_slab(TA::World& world, int A, int B, int C,
+                            double offset, Fn inner_fn) {
+  ToT out(world, tr3(A, B, C));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    // Pass 1: query every cell's inner range and assign it an offset in the
+    // slab, so the total size is known before allocating.
+    const std::size_t n_cells = tile_range.volume();
+    std::vector ranges;
+    ranges.reserve(n_cells);
+    std::vector cell_offsets(n_cells);
+    std::size_t total_elems = 0;
+    {
+      std::size_t ord = 0;
+      for (auto outer_idx : tile_range) {
+        const long o0 = static_cast(outer_idx[0]);
+        const long o1 = static_cast(outer_idx[1]);
+        const long o2 = static_cast(outer_idx[2]);
+        TA::Range ir = inner_fn(o0, o1, o2);
+        cell_offsets[ord] = total_elems;
+        const std::size_t vol = ir.volume();
+        // Round each cell up to a multiple of 8 elements; with the 64-byte
+        // slab alignment below this keeps every cell start 64-byte aligned
+        // (assuming 8-byte double).
+        const std::size_t padded = (vol + 7) & ~std::size_t{7};
+        total_elems += padded;
+        ranges.push_back(std::move(ir));
+        ++ord;
+      }
+    }
+
+    // One 64-byte-aligned allocation per tile, released with std::free. The
+    // memory is not zeroed; only the first `vol` elements of each cell are
+    // written below.
+    std::shared_ptr slab;
+    if (total_elems > 0) {
+      void* raw = nullptr;
+      if (posix_memalign(&raw, 64, total_elems * sizeof(double)) != 0) {
+        std::abort();
+      }
+      slab = std::shared_ptr(static_cast(raw),
+                             [](double* p) { std::free(p); });
+    }
+
+    // Pass 2: fill each cell in place and wrap it as an inner tensor.
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast(outer_idx[0]);
+      const long o1 = static_cast(outer_idx[1]);
+      const long o2 = static_cast(outer_idx[2]);
+      auto& ir = ranges[ord];
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        // Aliasing shared_ptr: points at this cell's offset but shares
+        // ownership of the whole slab, so the slab lives as long as any cell.
+        std::shared_ptr alias(slab,
+                              slab.get() + cell_offsets[ord]);
+        for (std::size_t k = 0; k < vol; ++k)
+          alias[k] = offset + 1e-4 * static_cast(
+                                         o0 * 100000 + o1 * 1000 + o2 * 100 + k);
+        *(tile.data() + ord) = inner_t(ir, std::move(alias));
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 3-D heap-scattered jagged ToT.
+/// `inner_fn(o0, o1, o2)` returns the inner range for an outer element. Each
+/// non-empty cell is constructed as its own inner tensor; cells whose range
+/// has zero volume are stored as default-constructed (empty) inners. Element
+/// values use the same formula as the slab-backed 3-D builder.
+/// NOTE(review): template parameter lists and `static_cast` / `shared_ptr` /
+/// `vector` arguments are missing throughout this copy of the patch; restore
+/// from upstream before applying.
+template
+ToT make_tot_3d_jagged(TA::World& world, int A, int B, int C, double offset,
+                       Fn inner_fn) {
+  ToT out(world, tr3(A, B, C));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast(outer_idx[0]);
+      const long o1 = static_cast(outer_idx[1]);
+      const long o2 = static_cast(outer_idx[2]);
+      TA::Range ir = inner_fn(o0, o1, o2);
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        inner_t inner(ir);
+        for (std::size_t k = 0; k < vol; ++k)
+          inner.at_ordinal(k) =
+              offset + 1e-4 * static_cast(
+                                  o0 * 100000 + o1 * 1000 + o2 * 100 + k);
+        *(tile.data() + ord) = std::move(inner);
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 4-D slab-backed jagged ToT.
+/// `inner_fn(o0, o1, o2, o3)` returns the inner range for an outer element.
+/// All inner cells of one tile share a single 64-byte-aligned allocation.
+template
+ToT make_tot_4d_jagged_slab(TA::World& world, int A, int B, int C, int D,
+                            double offset, Fn inner_fn) {
+  ToT out(world, tr4(A, B, C, D));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    // Pass 1: collect inner ranges and per-cell slab offsets.
+    const std::size_t n_cells = tile_range.volume();
+    std::vector ranges;
+    ranges.reserve(n_cells);
+    std::vector cell_offsets(n_cells);
+    std::size_t total_elems = 0;
+    {
+      std::size_t ord = 0;
+      for (auto outer_idx : tile_range) {
+        const long o0 = static_cast(outer_idx[0]);
+        const long o1 = static_cast(outer_idx[1]);
+        const long o2 = static_cast(outer_idx[2]);
+        const long o3 = static_cast(outer_idx[3]);
+        TA::Range ir = inner_fn(o0, o1, o2, o3);
+        cell_offsets[ord] = total_elems;
+        const std::size_t vol = ir.volume();
+        // Round each cell up to a multiple of 8 elements.
+        const std::size_t padded = (vol + 7) & ~std::size_t{7};
+        total_elems += padded;
+        ranges.push_back(std::move(ir));
+        ++ord;
+      }
+    }
+    std::shared_ptr slab;
+    if (total_elems > 0) {
+      void* raw = nullptr;
+      if (posix_memalign(&raw, 64, total_elems * sizeof(double)) != 0) {
+        std::abort();
+      }
+      slab = std::shared_ptr(static_cast(raw),
+                             [](double* p) { std::free(p); });
+    }
+    // Pass 2: fill each cell in place and wrap it as an inner tensor.
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast(outer_idx[0]);
+      const long o1 = static_cast(outer_idx[1]);
+      const long o2 = static_cast(outer_idx[2]);
+      const long o3 = static_cast(outer_idx[3]);
+      auto& ir = ranges[ord];
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        // Aliasing shared_ptr: shares ownership of the slab, points at the
+        // cell's own offset.
+        std::shared_ptr alias(slab,
+                              slab.get() + cell_offsets[ord]);
+        for (std::size_t k = 0; k < vol; ++k)
+          alias[k] = offset + 1e-4 * static_cast(
+                                         o0 * 1000000 + o1 * 10000 +
+                                         o2 * 100 + o3 * 10 + k);
+        *(tile.data() + ord) = inner_t(ir, std::move(alias));
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 4-D heap-scattered jagged ToT.
+/// Same element values as the slab-backed 4-D builder, but every non-empty
+/// cell is constructed as its own inner tensor.
+template
+ToT make_tot_4d_jagged(TA::World& world, int A, int B, int C, int D,
+                       double offset, Fn inner_fn) {
+  ToT out(world, tr4(A, B, C, D));
+  out.init_tiles([offset, inner_fn](const TA::Range& tile_range) {
+    tile_t tile(tile_range);
+    std::size_t ord = 0;
+    for (auto outer_idx : tile_range) {
+      const long o0 = static_cast(outer_idx[0]);
+      const long o1 = static_cast(outer_idx[1]);
+      const long o2 = static_cast(outer_idx[2]);
+      const long o3 = static_cast(outer_idx[3]);
+      TA::Range ir = inner_fn(o0, o1, o2, o3);
+      const std::size_t vol = ir.volume();
+      if (vol == 0) {
+        *(tile.data() + ord) = inner_t{};
+      } else {
+        inner_t inner(ir);
+        for (std::size_t k = 0; k < vol; ++k)
+          inner.at_ordinal(k) =
+              offset + 1e-4 * static_cast(
+                                  o0 * 1000000 + o1 * 10000 + o2 * 100 +
+                                  o3 * 10 + k);
+        *(tile.data() + ord) = std::move(inner);
+      }
+      ++ord;
+    }
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Builds a 3-D array of plain tiles; element `k` of every tile is
+/// `offset + 1e-3 * k`, where `k` is the ordinal within that tile.
+inline Plain make_plain_3d(TA::World& world, int A, int B, int C,
+                           double offset) {
+  Plain out(world, tr3(A, B, C));
+  out.init_tiles([offset](const TA::Range& r) {
+    inner_t tile(r);
+    for (std::size_t k = 0; k < r.volume(); ++k)
+      tile.at_ordinal(k) = offset + 1e-3 * static_cast(k);
+    return tile;
+  });
+  world.gop.fence();
+  return out;
+}
+
+/// Largest absolute element difference between `a` and `b`, taken over the
+/// tiles that are local to `a`.
+/// Returns the sentinel 1e30 when the tiled ranges differ, when a tile pair
+/// differs in volume, or when two non-empty inner cells differ in volume. A
+/// cell that is empty on exactly one side contributes a difference of 1.0.
+inline double max_abs_diff(const ToT& a, const ToT& b) {
+  if (a.trange() != b.trange()) return 1e30;
+  double mx = 0.0;
+  const auto& tr = a.trange();
+  for (auto t = tr.tiles_range().begin(); t != tr.tiles_range().end(); ++t) {
+    if (!a.is_local(*t)) continue;
+    auto ta = a.find(*t).get();
+    auto tb = b.find(*t).get();
+    if (ta.range().volume() != tb.range().volume()) return 1e30;
+    for (std::size_t ord = 0; ord < ta.range().volume(); ++ord) {
+      const auto& ia = *(ta.data() + ord);
+      const auto& ib = *(tb.data() + ord);
+      if (ia.range().volume() != ib.range().volume()) {
+        if (ia.range().volume() == 0 || ib.range().volume() == 0) {
+          mx = std::max(mx, 1.0);
+          continue;
+        }
+        return 1e30;
+      }
+      for (std::size_t k = 0; k < ia.range().volume(); ++k) {
+        double d = std::abs(ia.at_ordinal(k) - ib.at_ordinal(k));
+        if (d > mx) mx = d;
+      }
+    }
+  }
+  return mx;
+}
+
+/// Outcome of time_run(): minimum and median wall time in nanoseconds, the
+/// result of the last run, and an error message when `ok` is false.
+struct RunResult {
+  double wall_ns_min = 0.0;
+  double wall_ns_med = 0.0;
+  ToT result;
+  bool ok = true;
+  std::string err;
+};
+
+/// Calls `run()` `repeats` times and records the wall time of each call.
+/// Each call is bracketed by world fences, and TA::detail::arena_disabled() is
+/// set to `disable_arena` before every call. If `run()` throws, timing stops
+/// and the returned result has `ok == false` with the message in `err`.
+/// A non-positive `repeats` is reported the same way instead of indexing an
+/// empty sample vector.
+template
+RunResult time_run(TA::World& world, Runner&& run, bool disable_arena,
+                   int repeats) {
+  RunResult R;
+  // Without at least one sample there is no minimum or median to report, and
+  // a negative count must not reach reserve().
+  if (repeats <= 0) {
+    R.ok = false;
+    R.err = "repeats must be positive";
+    return R;
+  }
+  std::vector ns;
+  ns.reserve(repeats);
+  for (int r = 0; r < repeats; ++r) {
+    TA::detail::arena_disabled() = disable_arena;
+    world.gop.fence();
+    auto t0 = std::chrono::steady_clock::now();
+    try {
+      R.result = run();
+      world.gop.fence();
+    } catch (const std::exception& e) {
+      R.ok = false;
+      R.err = e.what();
+      return R;
+    } catch (...) {
+      R.ok = false;
+      R.err = "unknown";
+      return R;
+    }
+    auto t1 = std::chrono::steady_clock::now();
+    ns.push_back(
+        std::chrono::duration_cast(t1 - t0).count());
+  }
+  std::sort(ns.begin(), ns.end());
+  R.wall_ns_min = ns.front();
+  // Upper median for an even number of samples.
+  R.wall_ns_med = ns[ns.size() / 2];
+  return R;
+}
+
+/// Runs a case binary by building operands once and timing one mode.
+template +int run_case_main(int argc, char** argv, const char* case_name, Build build, + Run run) { + // Heap and arena timings must run in separate processes to avoid allocator/cache bias. + std::string mode; + int repeats = 3; + bool quiet = false; + int tile_grid = 7; + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + if (a == "--mode" && i + 1 < argc) { + mode = argv[++i]; + } else if (a == "--repeat" && i + 1 < argc) { + repeats = std::atoi(argv[++i]); + } else if (a == "--tile-grid" && i + 1 < argc) { + tile_grid = std::max(1, std::atoi(argv[++i])); + } else if (a == "--h-scale" && i + 1 < argc) { + g_h_scale() = std::max(1, std::atoi(argv[++i])); + } else if (a == "--quiet") { + quiet = true; + } else if (a == "-h" || a == "--help") { + std::cout + << "Usage: " << argv[0] + << " --mode {heap|arena} [--tile-grid G] [--h-scale S] " + "[--repeat R] [--quiet]\n" + "MAD_NUM_THREADS env var controls thread count.\n" + "Note: --mode is required. heap and arena MUST be benchmarked\n" + "in separate processes — running both in one process biases the\n" + "second run via allocator fragmentation and cache residue.\n"; + return 0; + } + } + if (mode != "heap" && mode != "arena") { + std::cerr << "error: --mode must be 'heap' or 'arena' (got '" + << mode << "')\n"; + return 2; + } + g_tile_grid() = tile_grid; + + TA::World& world = TA::initialize(argc, argv); + + const char* threads_env = std::getenv("MAD_NUM_THREADS"); + std::string threads_label = threads_env ? 
threads_env : "default"; + + std::cout << "case,mode,tile_grid,threads,wall_ns_min,wall_ns_med,verified\n"; + + if (!quiet) { + std::cerr << "# " << case_name << " tile_grid=" << tile_grid + << " h_scale=" << g_h_scale() + << " threads=" << threads_label << "\n"; + } + + auto operands = build(world); + + auto emit = [&](const char* m, const RunResult& R, const std::string& v) { + if (!R.ok) { + std::cout << case_name << "," << m << "," << tile_grid << "," + << threads_label << ",NA,NA,err:" << R.err << "\n"; + return; + } + std::cout << case_name << "," << m << "," << tile_grid << "," + << threads_label << "," << static_cast(R.wall_ns_min) + << "," << static_cast(R.wall_ns_med) << "," << v + << "\n"; + }; + + if (mode == "heap") { + auto Rh = time_run( + world, [&]() { return run(operands); }, true, + repeats); + emit("heap", Rh, "single"); + if (!quiet) { + std::cerr << " heap=" << Rh.wall_ns_med / 1e6 << "ms\n"; + } + } else { + auto Ra = time_run( + world, [&]() { return run(operands); }, false, + repeats); + emit("arena", Ra, "single"); + if (!quiet) { + std::cerr << " arena=" << Ra.wall_ns_med / 1e6 << "ms\n"; + } + } + + std::cout.flush(); + TA::detail::arena_disabled() = false; + TA::finalize(); + return 0; +} + +/// Runs a case binary with separate heap-scatter and arena-slab input builders. +template +int run_case_main_split(int argc, char** argv, const char* case_name, + BuildHeap build_heap, BuildArena build_arena, + Run run) { + // Heap and arena timings must run in separate processes to avoid allocator/cache bias. 
+ std::string mode; + int repeats = 3; + bool quiet = false; + int tile_grid = 7; + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + if (a == "--mode" && i + 1 < argc) { + mode = argv[++i]; + } else if (a == "--repeat" && i + 1 < argc) { + repeats = std::atoi(argv[++i]); + } else if (a == "--tile-grid" && i + 1 < argc) { + tile_grid = std::max(1, std::atoi(argv[++i])); + } else if (a == "--h-scale" && i + 1 < argc) { + g_h_scale() = std::max(1, std::atoi(argv[++i])); + } else if (a == "--quiet") { + quiet = true; + } else if (a == "-h" || a == "--help") { + std::cout << "Usage: " << argv[0] + << " --mode {heap|arena} [--tile-grid G] [--h-scale S] " + "[--repeat R] [--quiet]\n" + "Heap mode uses scattered (legacy) inputs; arena mode " + "uses slab-backed inputs.\n" + "Note: --mode is required. heap and arena MUST be " + "benchmarked in separate\n" + "processes — running both in one process biases the " + "second run via allocator\n" + "fragmentation and cache residue.\n"; + return 0; + } + } + if (mode != "heap" && mode != "arena") { + std::cerr << "error: --mode must be 'heap' or 'arena' (got '" + << mode << "')\n"; + return 2; + } + g_tile_grid() = tile_grid; + + TA::World& world = TA::initialize(argc, argv); + + const char* threads_env = std::getenv("MAD_NUM_THREADS"); + std::string threads_label = threads_env ? 
threads_env : "default"; + + std::cout << "case,mode,tile_grid,threads,wall_ns_min,wall_ns_med,verified\n"; + + if (!quiet) { + std::cerr << "# " << case_name << " tile_grid=" << tile_grid + << " h_scale=" << g_h_scale() + << " threads=" << threads_label + << " (split inputs: heap=scatter, arena=slab)\n"; + } + + auto emit = [&](const char* m, const RunResult& R, const std::string& v) { + if (!R.ok) { + std::cout << case_name << "," << m << "," << tile_grid << "," + << threads_label << ",NA,NA,err:" << R.err << "\n"; + return; + } + std::cout << case_name << "," << m << "," << tile_grid << "," + << threads_label << "," << static_cast(R.wall_ns_min) + << "," << static_cast(R.wall_ns_med) << "," << v + << "\n"; + }; + + if (mode == "heap") { + auto operands = build_heap(world); + auto Rh = time_run( + world, [&]() { return run(operands); }, true, + repeats); + emit("heap", Rh, "single"); + if (!quiet) std::cerr << " heap=" << Rh.wall_ns_med / 1e6 << "ms\n"; + } else { + auto operands = build_arena(world); + auto Ra = time_run( + world, [&]() { return run(operands); }, false, + repeats); + emit("arena", Ra, "single"); + if (!quiet) std::cerr << " arena=" << Ra.wall_ns_med / 1e6 << "ms\n"; + } + + std::cout.flush(); + TA::detail::arena_disabled() = false; + TA::finalize(); + return 0; +} + +} diff --git a/tests/cases/case_hec_e.cpp b/tests/cases/case_hec_e.cpp new file mode 100644 index 0000000000..83e767e4f2 --- /dev/null +++ b/tests/cases/case_hec_e.cpp @@ -0,0 +1,40 @@ +/// hec_e: A(h,i,j;m) * B(h,j,k;n) -> C(h,i,k;m,n); inner outer-product (i, k). 
+
+#include "case_common.h"
+
+namespace c = cases;
+
+/// Operands handed from the builder callbacks to the timed callback.
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  // Extent of the i/j/k outer dimensions; the h extent is N * --h-scale.
+  constexpr int N = 30;
+  // Inner-range shapes: lhs inner extent is the outer index i, rhs inner
+  // extent is the outer index k (presumably rank-1 ranges; index 0 then gives
+  // an empty inner cell -- confirm against TiledArray::Range).
+  auto sl = [](long /*h*/, long i, long /*j*/) {
+    return TiledArray::Range{i};
+  };
+  auto sr = [](long /*h*/, long /*j*/, long k) {
+    return TiledArray::Range{k};
+  };
+  // Callback order: heap-input builder, slab-input builder, timed expression.
+  return c::run_case_main_split(
+      argc, argv, "hec_e",
+      [&](TiledArray::World& w) {
+        // Read g_h_scale() here, after the command line has been parsed.
+        const int H = N * c::g_h_scale();
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged(w, H, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged(w, H, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        const int H = N * c::g_h_scale();
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged_slab(w, H, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, H, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("h,i,j;m"), ops.rhs("h,j,k;n"),
+                                  "h,i,k;m,n");
+      });
+}
diff --git a/tests/cases/case_hec_ec.cpp b/tests/cases/case_hec_ec.cpp
new file mode 100644
index 0000000000..857c8fcba8
--- /dev/null
+++ b/tests/cases/case_hec_ec.cpp
@@ -0,0 +1,38 @@
+/// hec_ec: A(h,i,j;m,p) * B(h,j,k;p,n) -> C(h,i,k;m,n); inner contracts p.
+
+#include "case_common.h"
+
+namespace c = cases;
+
+/// Operands handed from the builder callbacks to the timed callback.
+struct Ops {
+  c::ToT lhs;
+  c::ToT rhs;
+};
+
+int main(int argc, char** argv) {
+  // Extent of all three outer dimensions; --h-scale is not applied here.
+  constexpr int N = 60;
+  // Inner-range shapes: lhs inner is (i, j), rhs inner is (j, k), so the
+  // shared inner index p has extent j on both sides.
+  auto sl = [](long /*h*/, long i, long j) {
+    return TiledArray::Range{i, j};
+  };
+  auto sr = [](long /*h*/, long j, long k) {
+    return TiledArray::Range{j, k};
+  };
+  // Callback order: heap-input builder, slab-input builder, timed expression.
+  return c::run_case_main_split(
+      argc, argv, "hec_ec",
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged(w, N, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged(w, N, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](TiledArray::World& w) {
+        Ops ops;
+        ops.lhs = c::make_tot_3d_jagged_slab(w, N, N, N, 1.0, sl);
+        ops.rhs = c::make_tot_3d_jagged_slab(w, N, N, N, 100.0, sr);
+        return ops;
+      },
+      [&](Ops& ops) {
+        return TiledArray::einsum(ops.lhs("h,i,j;m,p"), ops.rhs("h,j,k;p,n"),
+                                  "h,i,k;m,n");
+      });
+}
diff --git a/tests/cases/case_hec_h.cpp b/tests/cases/case_hec_h.cpp
new file mode 100644
index 0000000000..8b4da12071
--- /dev/null
+++ b/tests/cases/case_hec_h.cpp
@@ -0,0 +1,35 @@
+/// hec_h: A(h,i,j;m,n) * B(h,j,k;m,n) -> C(h,i,k;m,n); inner = (h, h).
+ +#include "case_common.h" + +namespace c = cases; + +struct Ops { + c::ToT lhs; + c::ToT rhs; +}; + +int main(int argc, char** argv) { + constexpr int N = 56; + auto sf = [](long h, long /*o1*/, long /*o2*/) { + return TiledArray::Range{h, h}; + }; + return c::run_case_main_split( + argc, argv, "hec_h", + [&](TiledArray::World& w) { + Ops ops; + ops.lhs = c::make_tot_3d_jagged(w, N, N, N, /*offset=*/1.0, sf); + ops.rhs = c::make_tot_3d_jagged(w, N, N, N, /*offset=*/100.0, sf); + return ops; + }, + [&](TiledArray::World& w) { + Ops ops; + ops.lhs = c::make_tot_3d_jagged_slab(w, N, N, N, /*offset=*/1.0, sf); + ops.rhs = c::make_tot_3d_jagged_slab(w, N, N, N, /*offset=*/100.0, sf); + return ops; + }, + [&](Ops& ops) { + return TiledArray::einsum(ops.lhs("h,i,j;m,n"), ops.rhs("h,j,k;m,n"), + "h,i,k;m,n"); + }); +} diff --git a/tests/cases/case_hec_scale.cpp b/tests/cases/case_hec_scale.cpp new file mode 100644 index 0000000000..34d399e323 --- /dev/null +++ b/tests/cases/case_hec_scale.cpp @@ -0,0 +1,35 @@ +/// hec_scale: A(h,i,j;m,n) * B_plain(h,j,k) -> C(h,i,k;m,n); inner scale. 
+ +#include "case_common.h" + +namespace c = cases; + +struct Ops { + c::ToT lhs; + c::Plain rhs; +}; + +int main(int argc, char** argv) { + constexpr int N = 56; + auto sl = [](long /*h*/, long i, long /*j*/) { + return TiledArray::Range{i, i}; + }; + return c::run_case_main_split( + argc, argv, "hec_scale", + [&](TiledArray::World& w) { + Ops ops; + ops.lhs = c::make_tot_3d_jagged(w, N, N, N, 1.0, sl); + ops.rhs = c::make_plain_3d(w, N, N, N, 0.5); + return ops; + }, + [&](TiledArray::World& w) { + Ops ops; + ops.lhs = c::make_tot_3d_jagged_slab(w, N, N, N, 1.0, sl); + ops.rhs = c::make_plain_3d(w, N, N, N, 0.5); + return ops; + }, + [&](Ops& ops) { + return TiledArray::einsum(ops.lhs("h,i,j;m,n"), ops.rhs("h,j,k"), + "h,i,k;m,n"); + }); +} diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6d32285de2..8bae61cf1f 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -237,6 +237,22 @@ BOOST_AUTO_TEST_CASE(equal_nested_ranks) { {{0, 2}, {0, 3}, {0, 2}}, // {3}, // {2})); + + // H+C;C with permuted inner operands -- no outer permutation, so this + // exercises the regime-A arena path; manual_eval is the independent oracle. + BOOST_REQUIRE(check_manual_eval("ijk;om,ijk;on->ij;nm", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {3, 2}, // + {3, 2})); + + // H+C;H with a permuted inner Hadamard operand -- no outer permutation, so + // this exercises the regime-A arena Hadamard path against manual_eval. 
+ BOOST_REQUIRE(check_manual_eval("ijk;mn,ijk;nm->ij;mn", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {4, 3}, // + {3, 4})); // H+C;H+C not supported // H;C(op) diff --git a/tests/tot_construction.cpp b/tests/tot_construction.cpp new file mode 100644 index 0000000000..bfa3a9ee75 --- /dev/null +++ b/tests/tot_construction.cpp @@ -0,0 +1,728 @@ +/// Unified tensor-of-tensors construction: detail::make_nested_tile, +/// DistArray::init_tiles_nested, and the DistArray ToT range_fn constructor -- +/// exercised identically for TA::Tensor and ArenaTensor inner tiles. + +#include "TiledArray/einsum/tiledarray.h" +#include "TiledArray/tensor/arena_kernels.h" +#include "TiledArray/tensor/arena_tensor.h" +#include "tiledarray.h" + +#include "global_fixture.h" +#include "unit_test_config.h" + +#include +#include + +namespace { + +namespace TA = TiledArray; + +/// Deliberately non-uniform inner extent keyed on the outer element index. +inline long inner_extent(long e) { return 2 + (e % 3); } + +/// Build a rank-1 inner range of the inner tile's range type. +template +auto inner_range_for(const Index& idx) { + return + typename InnerTile::range_type{inner_extent(static_cast(idx[0]))}; +} + +/// Build a rank-2 (d0 x d1) inner range of the inner tile's range type. +/// Works for both TA::Range (TA::Tensor inner) and btas::zb::RangeNd +/// (ArenaTensor inner), which are both constructible from an extent vector. +template +auto inner_range_2d(std::size_t d0, std::size_t d1) { + return typename InnerTile::range_type(std::vector{d0, d1}); +} + +template +void verify_cell(const InnerTile& cell, long e, bool expect_filled) { + BOOST_REQUIRE(!cell.empty()); + BOOST_CHECK_EQUAL(static_cast(cell.size()), inner_extent(e)); + for (std::size_t i = 0; i < cell.size(); ++i) { + const double expect = expect_filled ? (100.0 * e + i) : 0.0; + BOOST_CHECK_EQUAL(cell.data()[i], expect); + } +} + +/// Fill an inner cell so element i of outer element e holds 100*e + i. 
+template +void fill_cell(Cell& cell, const Index& idx) { + const long e = static_cast(idx[0]); + for (std::size_t i = 0; i < cell.size(); ++i) cell.data()[i] = 100.0 * e + i; +} + +template +void test_make_nested_tile() { + using OuterTile = TA::Tensor; + const TA::Range outer{4}; + OuterTile tile = TA::detail::make_nested_tile( + outer, [](const auto& idx) { return inner_range_for(idx); }, + [](auto& cell, const auto& idx) { fill_cell(cell, idx); }); + BOOST_REQUIRE_EQUAL(tile.range().volume(), 4u); + for (long e = 0; e < 4; ++e) + verify_cell(tile.data()[e], e, /*expect_filled=*/true); +} + +template +void test_dist_array_tot_ctor() { + using Array = TA::DistArray, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + // ToT range_fn ctor: shapes every inner cell, storage zero-initialized. + Array a(world, trange, + [](const auto& idx) { return inner_range_for(idx); }); + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + verify_cell(tile.data()[ord], e, /*expect_filled=*/false); + } + } +} + +template +void test_init_tiles_nested() { + using Array = TA::DistArray, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + Array a(world, trange); + a.init_tiles_nested( + [](const auto& idx) { return inner_range_for(idx); }, + [](auto& cell, const auto& idx) { fill_cell(cell, idx); }); + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + verify_cell(tile.data()[ord], e, /*expect_filled=*/true); + } + } +} + +/// fill_random on an already-shaped ToT array is an in-place scalar mutator: +/// it overwrites every inner 
scalar while leaving the inner ranges intact. +template +void test_fill_random() { + using Array = TA::DistArray, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + Array a(world, trange, + [](const auto& idx) { return inner_range_for(idx); }); + a.fill_random(); + double sum = 0.0; + std::size_t ncells = 0; + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + const auto& cell = tile.data()[ord]; + // inner ranges must survive the in-place fill + BOOST_REQUIRE(!cell.empty()); + BOOST_CHECK_EQUAL(static_cast(cell.size()), inner_extent(e)); + for (std::size_t i = 0; i < cell.size(); ++i) sum += cell.data()[i]; + ++ncells; + } + } + BOOST_REQUIRE_GT(ncells, 0u); + // a random fill leaving every scalar exactly 0 is a measure-zero event + BOOST_CHECK_NE(sum, 0.0); +} + +/// init_elements drives the ToT constructor with an op that yields freestanding +/// owning inner tensors; for arena inners each outer tile collects the op +/// outputs, sizes one slab to fit, and deep-copies into the bound cells. 
+template +void test_init_elements() { + using Array = TA::DistArray, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + Array a(world, trange); + a.init_elements([](const auto& idx) { + const long e = static_cast(idx[0]); + TA::Tensor t{TA::Range(inner_extent(e))}; + for (std::size_t i = 0; i < t.size(); ++i) t.data()[i] = 100.0 * e + i; + return t; + }); + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + verify_cell(tile.data()[ord], e, /*expect_filled=*/true); + } + } +} + +/// fill on an already-shaped (uniform-extent) arena ToT deep-copies a +/// freestanding owning tensor into every bound inner cell. +void test_fill_arena() { + using InnerTile = TA::ArenaTensor; + using Array = TA::DistArray, TA::DensePolicy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + const long ext = 3; + Array a(world, trange, + [ext](const auto&) { return typename InnerTile::range_type{ext}; }); + TA::Tensor value{TA::Range(ext)}; + for (std::size_t i = 0; i < value.size(); ++i) value.data()[i] = 7.0 + i; + a.fill(value); + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const auto& cell = tile.data()[ord]; + BOOST_REQUIRE(!cell.empty()); + BOOST_CHECK_EQUAL(static_cast(cell.size()), ext); + for (std::size_t i = 0; i < cell.size(); ++i) + BOOST_CHECK_EQUAL(cell.data()[i], 7.0 + i); + } + } +} + +/// set(i, value) populates a tile of an unshaped arena ToT array: every inner +/// cell is sized to `value`'s range and deep-copies its data. 
+void test_set_value_arena() { + using InnerTile = TA::ArenaTensor; + using Array = TA::DistArray, TA::DensePolicy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + const long ext = 3; + // harvest a populated inner cell from a shaped source array + Array src(world, trange, + [ext](const auto&) { return typename InnerTile::range_type{ext}; }); + InnerTile value; + for (const auto& tidx : src.trange().tiles_range()) { + if (!src.is_local(tidx)) continue; + auto tile = src.find(tidx).get(); + value = tile.data()[0]; // null -> rebind: view src's cell 0 + for (std::size_t i = 0; i < value.size(); ++i) value.data()[i] = 10.0 + i; + break; + } + BOOST_REQUIRE(!value.empty()); + // populate a fresh, unshaped array tile-by-tile + Array a(world, trange); + for (const auto& tidx : a.trange().tiles_range()) + if (a.is_local(tidx)) a.set(tidx, value); + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const auto& cell = tile.data()[ord]; + BOOST_REQUIRE(!cell.empty()); + BOOST_CHECK_EQUAL(static_cast(cell.size()), ext); + for (std::size_t i = 0; i < cell.size(); ++i) + BOOST_CHECK_EQUAL(cell.data()[i], 10.0 + i); + } + } +} + +/// set(i, InIter) populates a tile from a sequence of freestanding owning +/// inner tensors; the slab is sized from their (possibly non-uniform) ranges. 
+void test_set_iter_arena() { + using InnerTile = TA::ArenaTensor; + using Array = TA::DistArray, TA::DensePolicy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + Array a(world, trange); + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + const auto tr = a.trange().make_tile_range(tidx); + std::vector> cells; + for (std::size_t ord = 0; ord < tr.volume(); ++ord) { + const long e = static_cast(tr.idx(ord)[0]); + TA::Tensor c{TA::Range(inner_extent(e))}; + for (std::size_t i = 0; i < c.size(); ++i) c.data()[i] = 100.0 * e + i; + cells.push_back(c); + } + a.set(tidx, cells.begin()); + } + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) continue; + auto tile = a.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + verify_cell(tile.data()[ord], e, /*expect_filled=*/true); + } + } +} + +/// Distributed: fetching an arena ToT tile owned by another rank transports +/// it via madness::archive, exercising Tensor's arena-aware +/// serialization end-to-end (slab marshalled out, rebuilt on the receiver). 
+void test_distributed_arena_tot() { + using InnerTile = TA::ArenaTensor; + using Array = TA::DistArray, TA::DensePolicy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4, 6, 8, 10, 12, 14}}; // 7 outer tiles + Array a(world, trange); + a.init_tiles_nested( + [](const auto& idx) { return inner_range_for(idx); }, + [](auto& cell, const auto& idx) { fill_cell(cell, idx); }); + world.gop.fence(); + std::size_t nremote = 0; + for (const auto& tidx : a.trange().tiles_range()) { + if (!a.is_local(tidx)) ++nremote; + auto tile = a.find(tidx).get(); // remote tile -> serialized transfer + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + verify_cell(tile.data()[ord], e, /*expect_filled=*/true); + } + } + // with >1 rank at least one tile must have been fetched (transported) here + if (world.size() > 1) BOOST_CHECK_GT(nremote, 0u); + world.gop.fence(); +} + +/// DistArray-level expression on a tensor-of-tensors, for plain and arena +/// inner tiles. `fill_a`/`fill_b` populate operands a/b; `expr(c, a, b)` +/// evaluates the expression under test; `expected(e, i)` is the reference +/// value for element i of outer element e. 
+template +void run_tot_expr(FillA fill_a, FillB fill_b, Expr expr, Expected expected) { + using Array = TA::DistArray, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}}; + Array a(world, trange), b(world, trange); + auto range_fn = [](const auto& idx) { + return inner_range_for(idx); + }; + a.init_tiles_nested(range_fn, fill_a); + b.init_tiles_nested(range_fn, fill_b); + world.gop.fence(); + Array c; + expr(c, a, b); + world.gop.fence(); + for (const auto& tidx : c.trange().tiles_range()) { + if (!c.is_local(tidx)) continue; + auto tile = c.find(tidx).get(); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const long e = static_cast(tile.range().idx(ord)[0]); + const auto& cell = tile.data()[ord]; + BOOST_REQUIRE(!cell.empty()); + BOOST_CHECK_EQUAL(static_cast(cell.size()), inner_extent(e)); + for (std::size_t i = 0; i < cell.size(); ++i) + BOOST_CHECK_EQUAL(cell.data()[i], expected(e, static_cast(i))); + } + } +} + +/// c = a + b, element-wise over matching inner cells. +template +void test_tot_add() { + auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); }; + run_tot_expr( + fill, fill, + [](auto& c, auto& a, auto& b) { c("i;j") = a("i;j") + b("i;j"); }, + [](long e, long i) { return 2.0 * (100.0 * e + i); }); +} + +/// c = a - b, element-wise over matching inner cells. +template +void test_tot_subt() { + auto fill_a = [](auto& cell, const auto& idx) { + const long e = static_cast(idx[0]); + for (std::size_t i = 0; i < cell.size(); ++i) + cell.data()[i] = 300.0 * e + 2.0 * i; + }; + auto fill_b = [](auto& cell, const auto& idx) { fill_cell(cell, idx); }; + run_tot_expr( + fill_a, fill_b, + [](auto& c, auto& a, auto& b) { c("i;j") = a("i;j") - b("i;j"); }, + [](long e, long i) { return 200.0 * e + i; }); +} + +/// c = a * b, full Hadamard (outer and inner) over matching inner cells. 
+template +void test_tot_mult() { + auto fill_a = [](auto& cell, const auto& idx) { + const long e = static_cast(idx[0]); + for (std::size_t i = 0; i < cell.size(); ++i) + cell.data()[i] = static_cast(e + static_cast(i) + 1); + }; + auto fill_b = [](auto& cell, const auto&) { + for (std::size_t i = 0; i < cell.size(); ++i) cell.data()[i] = 3.0; + }; + run_tot_expr( + fill_a, fill_b, + [](auto& c, auto& a, auto& b) { c("i;j") = a("i;j") * b("i;j"); }, + [](long e, long i) { return 3.0 * (e + i + 1); }); +} + +/// c = 3 * a, scalar scaling over inner cells. +template +void test_tot_scale() { + auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); }; + run_tot_expr( + fill, fill, [](auto& c, auto& a, auto&) { c("i;j") = 3.0 * a("i;j"); }, + [](long e, long i) { return 3.0 * (100.0 * e + i); }); +} + +/// c = 3 * (a + b); exercises the scaled-add tile op (add with a factor). +template +void test_tot_scaled_add() { + auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); }; + run_tot_expr( + fill, fill, + [](auto& c, auto& a, auto& b) { c("i;j") = 3.0 * (a("i;j") + b("i;j")); }, + [](long e, long i) { return 6.0 * (100.0 * e + i); }); +} + +/// c = 3 * (a - b); exercises the scaled-subt tile op (subt with a factor). +template +void test_tot_scaled_subt() { + auto fill_a = [](auto& cell, const auto& idx) { + const long e = static_cast(idx[0]); + for (std::size_t i = 0; i < cell.size(); ++i) + cell.data()[i] = 300.0 * e + 2.0 * i; + }; + auto fill_b = [](auto& cell, const auto& idx) { fill_cell(cell, idx); }; + run_tot_expr( + fill_a, fill_b, + [](auto& c, auto& a, auto& b) { c("i;j") = 3.0 * (a("i;j") - b("i;j")); }, + [](long e, long i) { return 3.0 * (200.0 * e + i); }); +} + +/// c = -a, negation over inner cells. 
+template +void test_tot_neg() { + auto fill = [](auto& cell, const auto& idx) { fill_cell(cell, idx); }; + run_tot_expr( + fill, fill, [](auto& c, auto& a, auto&) { c("i;j") = -a("i;j"); }, + [](long e, long i) { return -(100.0 * e + i); }); +} + +/// End-to-end ToT contraction through TA::einsum: outer Hadamard over i,j +/// with an outer contraction over k, plus an inner contraction. This routes +/// through the regime-A arena einsum path (the outer-Hadamard "hadamard +/// reduction" branch), not the expression-DSL delegation a pure-Hadamard +/// outer would take. `annot` is the einsum string; a's inner cells are +/// `a0 x a1`, b's are `b0 x b1`. A non-canonical inner annotation exercises +/// the inner-permutation hoist. The arena-inner result is checked against a +/// Tensor> reference run of the identical expression. +template +void test_tot_einsum_contraction(const char* annot, std::size_t a0, + std::size_t a1, std::size_t b0, + std::size_t b1) { + using Array = TA::DistArray, Policy>; + using RefArray = TA::DistArray>, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}, {0, 2, 4}, {0, 2}}; + + auto fill_a = [](auto& cell, const auto& idx) { + const long key = 7 * static_cast(idx[0]) + + 13 * static_cast(idx[1]) + + 31 * static_cast(idx[2]); + for (std::size_t p = 0; p < cell.size(); ++p) + cell.data()[p] = static_cast(1 + static_cast(p) + key); + }; + auto fill_b = [](auto& cell, const auto& idx) { + const long key = 5 * static_cast(idx[0]) + + 3 * static_cast(idx[1]) + + 11 * static_cast(idx[2]); + for (std::size_t p = 0; p < cell.size(); ++p) + cell.data()[p] = static_cast(2 + static_cast(p) + key); + }; + + Array a(world, trange), b(world, trange); + a.init_tiles_nested( + [a0, a1](const auto&) { return inner_range_2d(a0, a1); }, + fill_a); + b.init_tiles_nested( + [b0, b1](const auto&) { return inner_range_2d(b0, b1); }, + fill_b); + RefArray a_ref(world, trange), b_ref(world, trange); + a_ref.init_tiles_nested( + 
[a0, a1](const auto&) { + return inner_range_2d>(a0, a1); + }, + fill_a); + b_ref.init_tiles_nested( + [b0, b1](const auto&) { + return inner_range_2d>(b0, b1); + }, + fill_b); + world.gop.fence(); + + auto c = TA::einsum(annot, a, b); + auto c_ref = TA::einsum(annot, a_ref, b_ref); + world.gop.fence(); + + for (const auto& tidx : c.trange().tiles_range()) { + if (!c.is_local(tidx)) continue; + auto tile = c.find(tidx).get(); + auto ref_tile = c_ref.find(tidx).get(); + BOOST_REQUIRE_EQUAL(tile.range().volume(), ref_tile.range().volume()); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const auto& cell = tile.data()[ord]; + const auto& ref_cell = ref_tile.data()[ord]; + BOOST_REQUIRE(!cell.empty()); + BOOST_REQUIRE_EQUAL(cell.size(), ref_cell.size()); + for (std::size_t p = 0; p < cell.size(); ++p) + BOOST_CHECK_EQUAL(cell.data()[p], ref_cell.data()[p]); + } + } +} + +/// End-to-end T x ToT Hadamard through TA::einsum: a plain DistArray scales +/// each inner cell of a ToT array. A pure-Hadamard outer ("ij,ij;a->ij;a") +/// makes einsum delegate to the expression DSL, exercising the arena +/// `t x tot` Mult tile op. The arena-inner result is checked against an +/// identical Tensor-inner reference run (the legacy `binary` path). 
+template +void test_tot_einsum_t_x_tot() { + using ToTArray = TA::DistArray, Policy>; + using RefArray = TA::DistArray>, Policy>; + using PlainArray = TA::DistArray, Policy>; + TA::World& world = *GlobalFixture::world; + TA::TiledRange trange{{0, 2, 4}, {0, 2, 4}}; + + auto plain_fill = [](const TA::Range& r) { + TA::Tensor t(r); + for (std::size_t p = 0; p < t.size(); ++p) + t.data()[p] = 1.0 + static_cast(p); + return t; + }; + auto tot_fill = [](auto& cell, const auto& idx) { + const long key = + 7 * static_cast(idx[0]) + 13 * static_cast(idx[1]); + for (std::size_t p = 0; p < cell.size(); ++p) + cell.data()[p] = static_cast(2 + static_cast(p) + key); + }; + + PlainArray a(world, trange); + a.init_tiles(plain_fill); + ToTArray b(world, trange); + b.init_tiles_nested( + [](const auto&) { + return typename InnerTile::range_type(std::vector{4}); + }, + tot_fill); + RefArray b_ref(world, trange); + b_ref.init_tiles_nested( + [](const auto&) { + return TA::Tensor::range_type(std::vector{4}); + }, + tot_fill); + world.gop.fence(); + + auto c = TA::einsum("ij,ij;a->ij;a", a, b); + auto c_ref = TA::einsum("ij,ij;a->ij;a", a, b_ref); + world.gop.fence(); + + for (const auto& tidx : c.trange().tiles_range()) { + if (!c.is_local(tidx)) continue; + auto tile = c.find(tidx).get(); + auto ref_tile = c_ref.find(tidx).get(); + BOOST_REQUIRE_EQUAL(tile.range().volume(), ref_tile.range().volume()); + for (std::size_t ord = 0; ord < tile.range().volume(); ++ord) { + const auto& cell = tile.data()[ord]; + const auto& ref_cell = ref_tile.data()[ord]; + BOOST_REQUIRE(!cell.empty()); + BOOST_REQUIRE_EQUAL(cell.size(), ref_cell.size()); + for (std::size_t p = 0; p < cell.size(); ++p) + BOOST_CHECK_EQUAL(cell.data()[p], ref_cell.data()[p]); + } + } +} + +/// Tensor::permute with a bipartite permutation: outer cells +/// reorder shallowly, inner cells are permuted into a fresh slab. Here both +/// the outer and inner parts are transposes. 
+void test_arena_tile_permute() { + using Inner = TA::ArenaTensor; + using Outer = TA::Tensor; + constexpr long OI = 2, OJ = 3, R = 4, C = 5; + auto val = [](long oi, long oj, long ii, long ij) { + return 1.0 + oi * 1000.0 + oj * 100.0 + ii * 10.0 + ij; + }; + Outer tile = TA::detail::make_nested_tile( + TA::Range{OI, OJ}, + [](const auto&) { return inner_range_2d(R, C); }, + [&val](auto& cell, const auto& idx) { + const long oi = static_cast(idx[0]); + const long oj = static_cast(idx[1]); + for (long ii = 0; ii < R; ++ii) + for (long ij = 0; ij < C; ++ij) + cell.data()[ii * C + ij] = val(oi, oj, ii, ij); + }); + + // bipartite transpose over the combined index space {0,1 | 2,3}: outer + // part transposes dims 0,1 and inner part transposes dims 2,3; the trailing + // 2 marks the second (inner) partition size. + TA::BipartitePermutation bperm(TA::Permutation{1, 0, 3, 2}, 2); + Outer p = tile.permute(bperm); + + // outer range transposed: {OI,OJ} -> {OJ,OI} + BOOST_REQUIRE_EQUAL(p.range().extent(0), OJ); + BOOST_REQUIRE_EQUAL(p.range().extent(1), OI); + for (long oi = 0; oi < OI; ++oi) + for (long oj = 0; oj < OJ; ++oj) { + // src outer (oi,oj) lands at result outer (oj,oi) + const auto& cell = p.data()[oj * OI + oi]; + BOOST_REQUIRE(!cell.empty()); + BOOST_REQUIRE_EQUAL(static_cast(cell.size()), R * C); + // inner transposed: result cell range {R,C} -> {C,R} + BOOST_CHECK_EQUAL(cell.range().extent(0), C); + BOOST_CHECK_EQUAL(cell.range().extent(1), R); + for (long ii = 0; ii < R; ++ii) + for (long ij = 0; ij < C; ++ij) + BOOST_CHECK_EQUAL(cell.data()[ij * R + ii], val(oi, oj, ii, ij)); + } +} + +} // namespace + +BOOST_AUTO_TEST_SUITE(tot_construction_suite, TA_UT_LABEL_SERIAL) + +BOOST_AUTO_TEST_CASE(make_nested_tile_tensor_inner) { + test_make_nested_tile>(); +} + +BOOST_AUTO_TEST_CASE(make_nested_tile_arena_inner) { + test_make_nested_tile>(); +} + +BOOST_AUTO_TEST_CASE(dist_array_tot_ctor_tensor_inner) { + test_dist_array_tot_ctor, TA::DensePolicy>(); +} + 
+BOOST_AUTO_TEST_CASE(dist_array_tot_ctor_arena_inner) { + test_dist_array_tot_ctor, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(init_tiles_nested_tensor_inner) { + test_init_tiles_nested, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(init_tiles_nested_arena_inner) { + test_init_tiles_nested, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(fill_random_tensor_inner) { + test_fill_random, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(fill_random_arena_inner) { + test_fill_random, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(init_elements_tensor_inner) { + test_init_elements, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(init_elements_arena_inner) { + test_init_elements, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(fill_arena_inner) { test_fill_arena(); } + +BOOST_AUTO_TEST_CASE(set_value_arena_inner) { test_set_value_arena(); } + +BOOST_AUTO_TEST_CASE(set_iter_arena_inner) { test_set_iter_arena(); } + +BOOST_AUTO_TEST_CASE(add_tensor_inner) { + test_tot_add, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(add_arena_inner) { + test_tot_add, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(subt_tensor_inner) { + test_tot_subt, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(subt_arena_inner) { + test_tot_subt, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(mult_tensor_inner) { + test_tot_mult, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(mult_arena_inner) { + test_tot_mult, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(scaled_add_tensor_inner) { + test_tot_scaled_add, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(scaled_add_arena_inner) { + test_tot_scaled_add, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(scaled_subt_tensor_inner) { + test_tot_scaled_subt, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(scaled_subt_arena_inner) { + test_tot_scaled_subt, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(scale_tensor_inner) { + test_tot_scale, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(scale_arena_inner) { + test_tot_scale, TA::DensePolicy>(); +} + 
+BOOST_AUTO_TEST_CASE(neg_tensor_inner) { + test_tot_neg, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(neg_arena_inner) { + test_tot_neg, TA::DensePolicy>(); +} + +// canonical inner contraction: c(ij;mn) = sum_k sum_o a(ijk;mo) b(ijk;on) +BOOST_AUTO_TEST_CASE(einsum_contraction_tensor_inner) { + test_tot_einsum_contraction, TA::DensePolicy>( + "ijk;mo,ijk;on->ij;mn", 2, 3, 3, 2); +} +BOOST_AUTO_TEST_CASE(einsum_contraction_arena_inner) { + test_tot_einsum_contraction, TA::DensePolicy>( + "ijk;mo,ijk;on->ij;mn", 2, 3, 3, 2); +} + +// non-canonical inner annotations: operand A reordered (o,m) and the result +// reordered (n,m) -- exercises the regime-A inner-permutation hoist. +BOOST_AUTO_TEST_CASE(einsum_contraction_perm_tensor_inner) { + test_tot_einsum_contraction, TA::DensePolicy>( + "ijk;om,ijk;on->ij;nm", 3, 2, 3, 2); +} +BOOST_AUTO_TEST_CASE(einsum_contraction_perm_arena_inner) { + test_tot_einsum_contraction, TA::DensePolicy>( + "ijk;om,ijk;on->ij;nm", 3, 2, 3, 2); +} + +// inner Hadamard with a permuted operand: c(ij;mn) = sum_k a(ijk;mn) b(ijk;nm) +BOOST_AUTO_TEST_CASE(einsum_hadamard_perm_tensor_inner) { + test_tot_einsum_contraction, TA::DensePolicy>( + "ijk;mn,ijk;nm->ij;mn", 2, 3, 3, 2); +} +BOOST_AUTO_TEST_CASE(einsum_hadamard_perm_arena_inner) { + test_tot_einsum_contraction, TA::DensePolicy>( + "ijk;mn,ijk;nm->ij;mn", 2, 3, 3, 2); +} + +// plain T x ToT Hadamard: c(ij;a) = a(ij) * b(ij;a), routed through the +// expression-DSL Mult tile op. 
+BOOST_AUTO_TEST_CASE(einsum_t_x_tot_tensor_inner) { + test_tot_einsum_t_x_tot, TA::DensePolicy>(); +} +BOOST_AUTO_TEST_CASE(einsum_t_x_tot_arena_inner) { + test_tot_einsum_t_x_tot, TA::DensePolicy>(); +} + +BOOST_AUTO_TEST_CASE(arena_tile_bipartite_permute) { + test_arena_tile_permute(); +} + +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE(tot_construction_dist_suite, TA_UT_LABEL_DISTRIBUTED) + +BOOST_AUTO_TEST_CASE(arena_tot_remote_tile_transport) { + test_distributed_arena_tot(); +} + +BOOST_AUTO_TEST_SUITE_END()