diff --git a/README.md b/README.md index 2b95ed1..a368da1 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,12 @@ This repository holds the [MMTk](https://www.mmtk.io/) bindings for Ruby. The bi After building Ruby and the MMTk bindings, run Ruby with `RUBY_GC_LIBRARY=mmtk` environment variable. You can also configure the following environment variables: - `MMTK_PLAN=`: Configures the GC algorithm used by MMTk. Defaults to `Immix`. -- `MMTK_HEAP_MODE=`: Configures the MMTk heap used. `fixed` is a fixed size heap, `dynamic` is a dynamic sized heap that will grow and shrink in size based on heuristics using the [MemBalancer](https://dl.acm.org/doi/pdf/10.1145/3563323) algorithm. Defaults to `dynamic`. -- `MMTK_HEAP_MIN=`: Configures the lower bound in heap memory usage by MMTk. Only valid when `MMTK_HEAP_MODE=dynamic`. `size` is in bytes, but you can also append `KiB`, `MiB`, `GiB` for larger sizes. Defaults to 1MiB. +- `MMTK_HEAP_MODE=`: Configures the MMTk heap used. Defaults to `dynamic`. + - `fixed`: a fixed size heap. + - `dynamic`: a dynamic sized heap that will grow and shrink in size based on heuristics using the [MemBalancer](https://dl.acm.org/doi/pdf/10.1145/3563323) algorithm. + - `ruby`: a dynamic sized heap that grows and shrinks based on the ratio of free to used slots, using the same `RUBY_GC_HEAP_FREE_SLOTS_*_RATIO` env vars as the default Ruby GC. + - `cpu`: a dynamic sized heap that adjusts itself to hit a target GC CPU overhead, using the algorithm from [Tavakolisomeh et al., "Heap Size Adjustment with CPU Control" (MPLR '23)](https://dl.acm.org/doi/10.1145/3617651.3622988). Tunable via `MMTK_GC_CPU_TARGET` and `MMTK_GC_CPU_WINDOW` (see below). +- `MMTK_HEAP_MIN=`: Configures the lower bound in heap memory usage by MMTk. Only valid when `MMTK_HEAP_MODE` is `dynamic`, `ruby`, or `cpu`. `size` is in bytes, but you can also append `KiB`, `MiB`, `GiB` for larger sizes. Defaults to 1MiB. 
- `MMTK_HEAP_MAX=`: Configures the upper bound in heap memory usage by MMTk. Once this limit is reached and no objects can be garbage collected, it will crash with an out-of-memory. `size` is in bytes, but you can also append `KiB`, `MiB`, `GiB` for larger sizes. Defaults to 80% of your system RAM. +- `MMTK_GC_CPU_TARGET=`: Target GC CPU overhead, as a percentage, when `MMTK_HEAP_MODE=cpu`. After each GC cycle, the heap is grown if the measured GC CPU overhead exceeds this target and shrunk if it falls below. Defaults to `5`. The paper recommends `15` for the concurrent collector it targets (ZGC), but on MMTk-Ruby's stop-the-world Immix every percent of GC CPU also blocks the mutator, so a smaller budget gives better throughput. Empirical sweeps across ruby-bench find 5 Pareto-optimal vs. the `ruby` heap mode (~6% geomean speedup at essentially equal peak RSS). +- `MMTK_GC_CPU_WINDOW=`: Number of recent GC cycles averaged when measuring GC CPU overhead for `MMTK_HEAP_MODE=cpu`. Larger values smooth the signal at the cost of responsiveness. Defaults to `3`. diff --git a/bin/compare-heap-modes b/bin/compare-heap-modes new file mode 100755 index 0000000..f2b6184 --- /dev/null +++ b/bin/compare-heap-modes @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# Compare MMTk heap modes on ruby-bench. +# +# Runs the ruby-bench suite (expected checked out at $RUBY_BENCH_DIR, default +# ../ruby-bench) with two Ruby "executables" that are the same modular-GC +# Ruby, but wrapped so each sets a different MMTK_HEAP_MODE. +# +# Required env: +# RUBY_BIN Path to a Ruby built with --with-modular-gc and the +# MMTk binding installed. (e.g. ~/.rubies/ruby-mmtk/bin/ruby) +# +# Optional env: +# RUBY_BENCH_DIR Path to ruby-bench checkout (default: ../ruby-bench) +# MODES Space-separated list of heap modes to compare +# (default: "ruby cpu"). Others: "fixed dynamic". +# BENCHES Space-separated list of benchmarks to run (default: a +# curated small-but-GC-sensitive set). 
Pass empty string +# "" to run the whole default suite. +# WARMUP WARMUP_ITRS (default 5) +# BENCH MIN_BENCH_ITRS (default 10) +# TIME MIN_BENCH_TIME (default 20) +# MMTK_GC_CPU_TARGET CPU overhead target for `cpu` mode (default 5) +# MMTK_GC_CPU_WINDOW averaging window for `cpu` mode (default 3) +# +# Example: +# RUBY_BIN=~/.rubies/ruby-mmtk/bin/ruby \ +# bin/compare-heap-modes +# +# RUBY_BIN=~/.rubies/ruby-mmtk/bin/ruby MODES="ruby cpu dynamic" \ +# BENCHES="liquid-render psych-load railsbench" \ +# bin/compare-heap-modes + +set -euo pipefail + +if [ -z "${RUBY_BIN:-}" ]; then + echo "error: RUBY_BIN must be set to a Ruby built with --with-modular-gc" >&2 + exit 64 +fi +if [ ! -x "$RUBY_BIN" ]; then + echo "error: RUBY_BIN=$RUBY_BIN is not executable" >&2 + exit 64 +fi + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +BENCH_DIR="${RUBY_BENCH_DIR:-$REPO_ROOT/../ruby-bench}" +if [ ! -d "$BENCH_DIR" ]; then + echo "error: ruby-bench checkout not found at $BENCH_DIR" >&2 + echo " clone it with: git clone https://github.com/ruby/ruby-bench $BENCH_DIR" >&2 + exit 66 +fi + +# Put RUBY_BIN's bin directory first on PATH so `bundle exec`, `ruby`, and any +# shebangs invoking `ruby` resolve to the modular-GC Ruby instead of whatever +# system Ruby comes first in the caller's environment. +RUBY_BIN_DIR="$(cd "$(dirname "$RUBY_BIN")" && pwd)" +export PATH="$RUBY_BIN_DIR:$PATH" + +# Clear gem-path vars that might still point at a different Ruby's gems. +unset GEM_HOME GEM_PATH BUNDLE_PATH RUBYLIB RUBYOPT 2>/dev/null || true + +MODES=${MODES:-"ruby cpu"} +# A curated GC-sensitive subset. Override with BENCHES="". +DEFAULT_BENCHES="liquid-render psych-load railsbench lee binarytrees" +if [ -z "${BENCHES+x}" ]; then + BENCHES="$DEFAULT_BENCHES" +fi + +export WARMUP_ITRS="${WARMUP:-5}" +export MIN_BENCH_ITRS="${BENCH:-10}" +export MIN_BENCH_TIME="${TIME:-20}" + +# Export tunables so all wrapped runs see the same values. 
The `ruby` mode +# ignores MMTK_GC_CPU_*; the `cpu` mode ignores RUBY_GC_HEAP_*. +export MMTK_GC_CPU_TARGET="${MMTK_GC_CPU_TARGET:-5}" +export MMTK_GC_CPU_WINDOW="${MMTK_GC_CPU_WINDOW:-3}" + +WRAPPER="$REPO_ROOT/bin/ruby-mmtk-mode" +RUBY_ARGS=() +for mode in $MODES; do + RUBY_ARGS+=(-e "mmtk-$mode::$WRAPPER $mode -- ") +done + +cd "$BENCH_DIR" + +echo "== compare-heap-modes ==" +echo "ruby_bin: $RUBY_BIN" +echo "modes: $MODES" +echo "benches: ${BENCHES:-}" +echo "warmup: $WARMUP_ITRS" +echo "bench: $MIN_BENCH_ITRS iters / $MIN_BENCH_TIME s min" +echo "cpu target:$MMTK_GC_CPU_TARGET% window=$MMTK_GC_CPU_WINDOW" +echo "---" + +# `--rss` records peak RSS per run, essential for comparing memory footprint +# between heap-sizing policies. +# `--no-sudo` skips CPU governor / turbo tweaks that would need root. +export RUBY_BIN +exec bundle exec ./run_benchmarks.rb --no-sudo --rss "${RUBY_ARGS[@]}" ${BENCHES:-} diff --git a/bin/ruby-mmtk-mode b/bin/ruby-mmtk-mode new file mode 100755 index 0000000..a82ddbb --- /dev/null +++ b/bin/ruby-mmtk-mode @@ -0,0 +1,48 @@ +#!/bin/sh +# Wrapper that invokes a modular-GC Ruby with MMTk + a specific MMTK_HEAP_MODE. +# +# ruby-bench's run_benchmarks.rb compares Ruby executables passed via `-e`. +# Each `-e` entry is a single command line, so to compare two MMTk heap modes +# we need one executable per mode with the relevant env vars already baked in. +# +# Usage: +# bin/ruby-mmtk-mode [-- extra env VAR=VAL ...] -- +# +# The caller is expected to set RUBY_BIN to the path of a Ruby built with +# --with-modular-gc and the MMTk binding installed (or to have it on $PATH). +# +# Examples: +# RUBY_BIN=$HOME/.rubies/ruby-mmtk/bin/ruby bin/ruby-mmtk-mode ruby -- -e 'puts GC.config' +# RUBY_BIN=$HOME/.rubies/ruby-mmtk/bin/ruby bin/ruby-mmtk-mode cpu -- -e 'puts GC.config' + +set -eu + +if [ $# -lt 1 ]; then + echo "usage: $0 [VAR=VAL ...] -- " >&2 + exit 64 +fi + +MODE=$1 +shift + +# Optional additional env vars before the `--` separator. 
+while [ $# -gt 0 ] && [ "$1" != "--" ]; do + case "$1" in + *=*) export "$1" ;; + *) + echo "$0: expected VAR=VAL or --, got: $1" >&2 + exit 64 + ;; + esac + shift +done +if [ $# -gt 0 ] && [ "$1" = "--" ]; then + shift +fi + +RUBY=${RUBY_BIN:-ruby} + +exec env \ + RUBY_GC_LIBRARY=mmtk \ + MMTK_HEAP_MODE="$MODE" \ + "$RUBY" "$@" diff --git a/bin/smoke-test b/bin/smoke-test new file mode 100755 index 0000000..a083364 --- /dev/null +++ b/bin/smoke-test @@ -0,0 +1,113 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# Smoke test for MMTk heap modes. +# +# Runs an allocation-heavy loop under a given MMTK_HEAP_MODE and reports: +# - the mode Ruby actually booted with (GC.config) +# - GC cycle count triggered during the loop +# - wall-clock time and process CPU time +# - peak resident set size (peak RSS) +# +# Usage (after `rake install:release` against a modular-GC Ruby): +# +# bin/smoke-test # defaults to MMTK_HEAP_MODE=cpu +# MMTK_HEAP_MODE=ruby bin/smoke-test +# MMTK_HEAP_MODE=cpu MMTK_GC_CPU_TARGET=10 bin/smoke-test +# SMOKE_ITERATIONS=2_000_000 bin/smoke-test # longer run for trigger to adapt +# +# If this script is run without RUBY_GC_LIBRARY=mmtk set, it will re-exec +# itself with that env var plus whatever other MMTK_* vars you passed. + +unless ENV["RUBY_GC_LIBRARY"] == "mmtk" + ENV["RUBY_GC_LIBRARY"] = "mmtk" + ENV["MMTK_HEAP_MODE"] ||= "cpu" + exec(RbConfig.ruby, __FILE__, *ARGV) +end + +impl = GC.config[:implementation] +unless impl == "mmtk" + abort "smoke-test: expected GC implementation 'mmtk', got #{impl.inspect}. " \ + "Is your Ruby built with --with-modular-gc and is the binding installed?" +end + +require "fiddle" + +# getrusage(RUSAGE_SELF) returns peak RSS in ru_maxrss. On macOS the value is +# in bytes; on Linux it's in kilobytes. +module Rusage + extend self + + RUSAGE_SELF = 0 + + # struct rusage on macOS/Linux: first two fields are ru_utime / ru_stime + # (struct timeval = { long, long }), then a series of long integers. 
+ # ru_maxrss is the 3rd long integer (offset after the 2 timevals). + # Each field here is a 64-bit long on 64-bit platforms. + # Layout (all i64): + # [0..1] ru_utime (sec, usec) + # [2..3] ru_stime (sec, usec) + # [4] ru_maxrss <-- what we want + # ... more fields we don't use + STRUCT_LONGS = 18 + + def peak_rss_bytes + libc = Fiddle::Handle::DEFAULT + getrusage = Fiddle::Function.new( + libc["getrusage"], [Fiddle::TYPE_INT, Fiddle::TYPE_VOIDP], Fiddle::TYPE_INT + ) + buf = Fiddle::Pointer.malloc(STRUCT_LONGS * Fiddle::SIZEOF_LONG, Fiddle::RUBY_FREE) + raise "getrusage failed" unless getrusage.call(RUSAGE_SELF, buf) == 0 + maxrss = buf[4 * Fiddle::SIZEOF_LONG, Fiddle::SIZEOF_LONG].unpack1("q") + # macOS reports bytes, Linux reports kilobytes. + RbConfig::CONFIG["host_os"].include?("darwin") ? maxrss : maxrss * 1024 + end +end + +puts "== MMTk smoke test ==" +puts "implementation: #{GC.config[:implementation]}" +puts "mmtk_plan: #{GC.config[:mmtk_plan]}" +puts "mmtk_heap_mode: #{GC.config[:mmtk_heap_mode]}" +puts "mmtk_heap_min: #{GC.config[:mmtk_heap_min]}" if GC.config[:mmtk_heap_min] +puts "mmtk_heap_max: #{GC.config[:mmtk_heap_max]}" +puts "mmtk_worker_count: #{GC.config[:mmtk_worker_count]}" +if GC.config[:mmtk_heap_mode] == "cpu" + puts "cpu target (env): #{ENV.fetch('MMTK_GC_CPU_TARGET', '5')}%" + puts "cpu window (env): #{ENV.fetch('MMTK_GC_CPU_WINDOW', '3')}" +end +puts "---" + +ITERATIONS = Integer(ENV.fetch("SMOKE_ITERATIONS", 500_000)) +OBJECT_SIZE = Integer(ENV.fetch("SMOKE_OBJECT_SIZE", 256)) +LIVE_SET = Integer(ENV.fetch("SMOKE_LIVE_SET", 2_000)) + +# The workload: maintain a rolling working set of LIVE_SET objects, each +# OBJECT_SIZE bytes. Each iteration allocates a new object and drops an old +# one. This produces a steady stream of garbage and a predictable live-set +# size, so the CPU trigger has a stable signal to converge on. 
+ +gc_before = GC.count +t_wall_start = Process.clock_gettime(Process::CLOCK_MONOTONIC) +t_cpu_start = Process.clock_gettime(Process::CLOCK_PROCESS_CPUTIME_ID) + +sink = Array.new(LIVE_SET) { String.new("x" * OBJECT_SIZE) } +i = 0 +while i < ITERATIONS + sink[i % LIVE_SET] = String.new("x" * OBJECT_SIZE) + i += 1 +end + +t_wall_end = Process.clock_gettime(Process::CLOCK_MONOTONIC) +t_cpu_end = Process.clock_gettime(Process::CLOCK_PROCESS_CPUTIME_ID) +gc_after = GC.count + +wall_s = t_wall_end - t_wall_start +cpu_s = t_cpu_end - t_cpu_start +rss = Rusage.peak_rss_bytes + +printf "iterations: %d (live set %d x %dB)\n", ITERATIONS, LIVE_SET, OBJECT_SIZE +printf "gc cycles: %d (before=%d, after=%d)\n", (gc_after - gc_before), gc_before, gc_after +printf "wall time: %.3fs\n", wall_s +printf "cpu time: %.3fs (%.1f%% of wall)\n", cpu_s, (cpu_s / wall_s) * 100.0 +printf "peak rss: %.1f MiB (%d bytes)\n", rss / 1024.0 / 1024.0, rss +puts "OK" diff --git a/doc/testing-cpu-heap-mode.md b/doc/testing-cpu-heap-mode.md new file mode 100644 index 0000000..6bc2212 --- /dev/null +++ b/doc/testing-cpu-heap-mode.md @@ -0,0 +1,196 @@ +# Testing the `cpu` heap mode against `ruby` + +This walks through building a modular-GC Ruby, installing the MMTk binding, +and using [`ruby/ruby-bench`](https://github.com/ruby/ruby-bench) to compare +`MMTK_HEAP_MODE=cpu` (the [Tavakolisomeh et al. 2023][paper] policy) against +`MMTK_HEAP_MODE=ruby` (the existing free-slot-ratio policy). + +[paper]: https://dl.acm.org/doi/10.1145/3617651.3622988 + +The headline: one binary, two env-var configurations, same benchmark suite, +compare wall-clock time and peak RSS. + +## 1. Build a modular-GC Ruby + +The MMTk binding plugs into a Ruby compiled with `--with-modular-gc`. You need +Ruby master (at least the commit where the modular-GC API landed, +[Feature #20470](https://bugs.ruby-lang.org/issues/20470)). + +```sh +# Pick a location — the rest of this doc assumes ~/src/ruby/ruby. 
+cd ~/src/ruby +git clone https://github.com/ruby/ruby.git +cd ruby + +./autogen.sh +./configure \ + --prefix="$HOME/.rubies/ruby-mmtk" \ + --with-modular-gc=./gc \ + --disable-install-doc \ + --enable-shared +make -j"$(nproc 2>/dev/null || sysctl -n hw.ncpu)" +make install +``` + +Verify: + +```sh +~/.rubies/ruby-mmtk/bin/ruby -v +~/.rubies/ruby-mmtk/bin/ruby -e 'puts RbConfig::CONFIG["configure_args"]' | tr ' ' '\n' | grep modular-gc +# => '--with-modular-gc=./gc' +``` + +## 2. Build and install the MMTk binding + +From this repository: + +```sh +# Install Rust if you don't have it: https://rustup.rs +cd ~/src/github.com/ruby/mmtk + +# Use the modular-GC Ruby we just built. +export PATH="$HOME/.rubies/ruby-mmtk/bin:$PATH" +hash -r +which ruby # => ~/.rubies/ruby-mmtk/bin/ruby + +bundle install +bundle exec rake install:release # or install:debug while iterating +``` + +`rake install:*` compiles the Rust crate and copies `librubygc.mmtk.{so,dylib}` +into `RbConfig::CONFIG["modular_gc_dir"]` of the Ruby we just built. + +## 3. Smoke test + +Before running a full benchmark suite, confirm the binding is wired up and +your new `cpu` mode boots: + +```sh +RUBY_BIN=~/.rubies/ruby-mmtk/bin/ruby + +# Baseline: existing 'ruby' policy. +"$RUBY_BIN" -e 'require "rbconfig"; ENV["RUBY_GC_LIBRARY"]="mmtk"; ENV["MMTK_HEAP_MODE"]="ruby"; exec(RbConfig.ruby, "-e", "p GC.config")' + +# New: the CPU-controlled policy. +"$RUBY_BIN" -e 'require "rbconfig"; ENV["RUBY_GC_LIBRARY"]="mmtk"; ENV["MMTK_HEAP_MODE"]="cpu"; exec(RbConfig.ruby, "-e", "p GC.config")' + +# Or use the convenience wrapper in this repo: +RUBY_BIN="$RUBY_BIN" \ + bin/ruby-mmtk-mode cpu -- -e 'p GC.config' + +# Or the smoke-test script, which also allocates and runs GC a few times: +RUBY_BIN="$RUBY_BIN" MMTK_HEAP_MODE=cpu bin/smoke-test +``` + +Expected output includes `:implementation=>"mmtk"` and +`:mmtk_heap_mode=>"cpu"` (or `"ruby"`). + +## 4. 
Run the existing Rust and Ruby test suites + +Still inside this repo: + +```sh +cargo test --manifest-path gc/mmtk/Cargo.toml # Rust unit tests +bundle exec rake test # Ruby integration tests +``` + +The Ruby test suite includes `test_MMTK_HEAP_MODE_cpu` which confirms the mode +parses correctly. + +## 5. Clone and prepare `ruby/ruby-bench` + +```sh +cd ~/src/github.com/ruby +git clone https://github.com/ruby/ruby-bench.git +cd ruby-bench +bundle install +``` + +Sanity check with the system Ruby: + +```sh +./run_benchmarks.rb --once fib +``` + +## 6. Compare `ruby` vs `cpu` on a GC-sensitive subset + +This repo ships a driver script that wires the wrapper into `ruby-bench`: + +```sh +cd ~/src/github.com/ruby/mmtk +RUBY_BIN=~/.rubies/ruby-mmtk/bin/ruby \ + bin/compare-heap-modes +``` + +Defaults: +- `MODES="ruby cpu"` (compares the two delegated heap modes) +- `BENCHES="liquid-render psych-load railsbench lee binarytrees"` — GC-sensitive + macrobenchmarks with meaningful allocation rates. +- `WARMUP=5 BENCH=10 TIME=20` — enough to get through at least a few GC cycles + per iteration so the `cpu` trigger's 3-cycle window is populated. +- `--rss` is always passed so peak RSS appears in the results table. + +Knobs: + +```sh +# Target 10% GC CPU overhead instead of the default 5%. +RUBY_BIN=... MMTK_GC_CPU_TARGET=10 bin/compare-heap-modes + +# Add more modes to the comparison. +RUBY_BIN=... MODES="fixed dynamic ruby cpu" bin/compare-heap-modes + +# Different benches. +RUBY_BIN=... BENCHES="optcarrot activerecord" bin/compare-heap-modes + +# Entire default suite. +RUBY_BIN=... BENCHES="" bin/compare-heap-modes +``` + +Output lands in `ruby-bench/data/output_*.{csv,json,txt}`. The console prints +a table like: + +``` +---------------- ----------- ---------- --------- ---------- -------- +bench mmtk-ruby stddev (%) mmtk-cpu stddev (%) ruby/cpu +liquid-render 345.0 1.2 312.0 1.8 1.11 +psych-load 512.3 0.8 498.7 1.1 1.03 +... 
+``` + +With `--rss`, RSS columns are appended per executable. The ratio `ruby/cpu` +shows throughput speedup (numbers >1 mean `cpu` is faster). Compare RSS +columns for the memory tradeoff. + +## 7. Interpret the results + +What to look for, and what the paper predicts: + +| Metric | Expected with `cpu` mode | +|--------|--------------------------| +| Wall-clock time | At the default `MMTK_GC_CPU_TARGET=5` typically a few percent faster than `ruby` mode on a geomean of GC-sensitive workloads, with rare regressions | +| Peak RSS | Within a few percent of `ruby` at the default 5% target; meaningfully lower (~20%) on allocation-heavy workloads like lobsters; higher (10–40%) at very low targets (1–2%) where the trigger grows the heap aggressively | +| GC count | Generally lower than `ruby` (the `cpu` mode keeps the heap large enough to stay under the CPU budget) | +| RSS vs target | Lower targets (1–3%) ⇒ more memory, fewer GCs, faster; higher targets (10–40%) ⇒ less memory, more GC, slower throughput | + +If the `cpu` mode blows up RSS or never converges, check: + +1. `GC.config[:mmtk_heap_max]` — confirms the upper bound is sane. +2. Per-GC logging: set `RUST_LOG=mmtk_ruby::heap::cpu_heap_trigger=debug` + (the trigger emits a `debug!` after each non-nursery GC with the current + `gc_cpu`, `factor`, and new `pages`). +3. Run with `MMTK_GC_CPU_WINDOW=1` to make the control loop maximally + responsive, or `=5` to smooth more. + +## 8. Notes and caveats + +- The paper targets ZGC, a concurrent generational collector. MMTk-Ruby + currently ships stop-the-world Immix/MarkSweep. The control law is the + same, but the absolute `gc_cpu` numbers will differ from the paper's. +- `CLOCK_PROCESS_CPUTIME_ID` sums CPU time across *all* threads of the + process, which on Ractor-using workloads correctly credits parallel mutator + work and parallel GC work. On single-threaded workloads it tracks wall + clock for the mutator phase. 
+- Nursery-only GCs are skipped by the trigger (consistent with MemBalancer), + so with a generational plan the `cpu` policy only re-sizes at major GCs. +- `ruby-bench`'s `--interleave` flag alternates between executables to cancel + thermal drift; worth adding when comparing small effect sizes. diff --git a/gc/mmtk/src/api.rs b/gc/mmtk/src/api.rs index b9797f6..ed592c6 100644 --- a/gc/mmtk/src/api.rs +++ b/gc/mmtk/src/api.rs @@ -14,7 +14,9 @@ use crate::abi::RubyBindingOptions; use crate::abi::RubyUpcalls; use crate::binding; use crate::binding::RubyBinding; +use crate::heap::CpuHeapTriggerConfig; use crate::heap::RubyHeapTriggerConfig; +use crate::heap::CPU_HEAP_TRIGGER_CONFIG; use crate::heap::RUBY_HEAP_TRIGGER_CONFIG; use crate::mmtk; use crate::utils::default_heap_max; @@ -131,6 +133,42 @@ fn mmtk_builder_default_parse_heap_mode(heap_min: usize, heap_max: usize) -> GCT Some(GCTriggerSelector::Delegated) } + "cpu" => { + // CPU-overhead-driven heap sizing based on Tavakolisomeh et al., + // "Heap Size Adjustment with CPU Control", MPLR '23. + // + // Target is expressed as a percentage (0, 100) via + // `MMTK_GC_CPU_TARGET`. The paper recommends 15 for ZGC (a + // concurrent collector); we default to 5 for MMTk-Ruby. With + // MMTk's stop-the-world Immix, every percent of GC CPU is also + // a percent of wall-clock the mutator is blocked on, so a much + // smaller budget is appropriate. An empirical sweep across + // ruby-bench (railsbench, lobsters, psych-load, liquid-render, + // lee) found target=5 to be Pareto-optimal: ~6% geomean speedup + // vs. the `ruby` heap mode with effectively identical geomean + // peak RSS. 
+                let target_percent = parse_float_env_var("MMTK_GC_CPU_TARGET", 5.0, 0.0, 100.0);
+                let window_size = parse_env_var::<usize>("MMTK_GC_CPU_WINDOW").unwrap_or(3);
+                let window_size = window_size.max(1);
+
+                let min_heap_pages = conversions::bytes_to_pages_up(heap_min);
+                let max_heap_pages = conversions::bytes_to_pages_up(heap_max);
+                // Start at the min heap size, as the other delegated triggers do.
+                // The control loop will adjust from here after the first GC cycle.
+                let initial_heap_pages = min_heap_pages;
+
+                CPU_HEAP_TRIGGER_CONFIG
+                    .set(CpuHeapTriggerConfig {
+                        min_heap_pages,
+                        max_heap_pages,
+                        initial_heap_pages,
+                        target_gc_cpu: target_percent / 100.0,
+                        window_size,
+                    })
+                    .unwrap_or_else(|_| panic!("CPU_HEAP_TRIGGER_CONFIG is already set"));
+
+                Some(GCTriggerSelector::Delegated)
+            }
             _ => None,
         })
         .unwrap_or_else(make_dynamic)
@@ -449,11 +487,20 @@ pub extern "C" fn mmtk_heap_mode() -> *const u8 {
     static FIXED_HEAP: &[u8] = b"fixed\0";
     static DYNAMIC_HEAP: &[u8] = b"dynamic\0";
     static RUBY_HEAP: &[u8] = b"ruby\0";
+    static CPU_HEAP: &[u8] = b"cpu\0";
     match *crate::BINDING.get().unwrap().mmtk.get_options().gc_trigger {
         GCTriggerSelector::FixedHeapSize(_) => FIXED_HEAP.as_ptr(),
         GCTriggerSelector::DynamicHeapSize(_, _) => DYNAMIC_HEAP.as_ptr(),
-        GCTriggerSelector::Delegated => RUBY_HEAP.as_ptr(),
+        GCTriggerSelector::Delegated => {
+            // Two delegated triggers exist; disambiguate via the populated
+            // config singleton.
+ if CPU_HEAP_TRIGGER_CONFIG.get().is_some() { + CPU_HEAP.as_ptr() + } else { + RUBY_HEAP.as_ptr() + } + } } } @@ -462,12 +509,18 @@ pub extern "C" fn mmtk_heap_min() -> usize { match *crate::BINDING.get().unwrap().mmtk.get_options().gc_trigger { GCTriggerSelector::FixedHeapSize(_) => 0, GCTriggerSelector::DynamicHeapSize(min_size, _) => min_size, - GCTriggerSelector::Delegated => conversions::pages_to_bytes( - RUBY_HEAP_TRIGGER_CONFIG - .get() - .expect("RUBY_HEAP_TRIGGER_CONFIG not set") - .min_heap_pages, - ), + GCTriggerSelector::Delegated => { + if let Some(cfg) = CPU_HEAP_TRIGGER_CONFIG.get() { + conversions::pages_to_bytes(cfg.min_heap_pages) + } else { + conversions::pages_to_bytes( + RUBY_HEAP_TRIGGER_CONFIG + .get() + .expect("RUBY_HEAP_TRIGGER_CONFIG not set") + .min_heap_pages, + ) + } + } } } @@ -476,12 +529,18 @@ pub extern "C" fn mmtk_heap_max() -> usize { match *crate::BINDING.get().unwrap().mmtk.get_options().gc_trigger { GCTriggerSelector::FixedHeapSize(max_size) => max_size, GCTriggerSelector::DynamicHeapSize(_, max_size) => max_size, - GCTriggerSelector::Delegated => conversions::pages_to_bytes( - RUBY_HEAP_TRIGGER_CONFIG - .get() - .expect("RUBY_HEAP_TRIGGER_CONFIG not set") - .max_heap_pages, - ), + GCTriggerSelector::Delegated => { + if let Some(cfg) = CPU_HEAP_TRIGGER_CONFIG.get() { + conversions::pages_to_bytes(cfg.max_heap_pages) + } else { + conversions::pages_to_bytes( + RUBY_HEAP_TRIGGER_CONFIG + .get() + .expect("RUBY_HEAP_TRIGGER_CONFIG not set") + .max_heap_pages, + ) + } + } } } diff --git a/gc/mmtk/src/collection.rs b/gc/mmtk/src/collection.rs index 28daa4f..648efa4 100644 --- a/gc/mmtk/src/collection.rs +++ b/gc/mmtk/src/collection.rs @@ -1,7 +1,9 @@ use crate::abi::GCThreadTLS; use crate::api::RubyMutator; +use crate::heap::CpuHeapTrigger; use crate::heap::RubyHeapTrigger; +use crate::heap::CPU_HEAP_TRIGGER_CONFIG; use crate::mmtk; use crate::upcalls; use crate::Ruby; @@ -95,7 +97,16 @@ impl Collection for VMCollection { } fn 
create_gc_trigger() -> Box<dyn GCTriggerPolicy<Ruby>> {
-        Box::new(RubyHeapTrigger::default())
+        // `GCTriggerSelector::Delegated` is currently used by two different
+        // heap modes: `ruby` (the Ruby-like free-slot ratio trigger) and `cpu`
+        // (the CPU-overhead trigger from Tavakolisomeh et al., MPLR '23).
+        // Which one is active is determined by which `OnceCell` config the
+        // `MMTK_HEAP_MODE` parser populated.
+        if CPU_HEAP_TRIGGER_CONFIG.get().is_some() {
+            Box::new(CpuHeapTrigger::default())
+        } else {
+            Box::new(RubyHeapTrigger::default())
+        }
     }
 }
diff --git a/gc/mmtk/src/heap/cpu_heap_trigger.rs b/gc/mmtk/src/heap/cpu_heap_trigger.rs
new file mode 100644
index 0000000..851ad79
--- /dev/null
+++ b/gc/mmtk/src/heap/cpu_heap_trigger.rs
@@ -0,0 +1,370 @@
+//! A GC trigger that adjusts the heap size based on the CPU overhead of GC.
+//!
+//! This is an implementation of the heap sizing policy described in
+//! Tavakolisomeh, Shimchenko, Österlund, Bruno, Ferreira, Wrigstad,
+//! "Heap Size Adjustment with CPU Control", MPLR '23.
+//!
+//!
+//! The idea: rather than letting heap size control GC frequency, let a
+//! user-supplied *target GC CPU overhead* control the heap size. After each GC
+//! cycle, we measure the GC CPU overhead (fraction of process CPU time spent
+//! in GC) and compare it to the target. If GC is over budget we grow the heap
+//! (reducing GC frequency); if it is under budget we shrink the heap (trading
+//! memory for more frequent collections).
+//!
+//! ## Algorithm
+//!
+//! After each GC cycle we compute, using an average of the last `n` cycles:
+//!
+//! ```text
+//! GC_CPU            = T_GC / T_APP                         (Eq. 1)
+//! overhead_error    = GC_CPU - target                      (Eq. 2)
+//! sigmoid_error     = 1 / (1 + e^(-overhead_error))        (Eq. 3)
+//! adjustment_factor = sigmoid_error + 0.5  (in (0.5, 1.5)) (Eq. 4)
+//! new_size          = current_size * adjustment_factor     (Eq. 5)
+//! ```
+//!
+//! where:
+//! - `T_GC` is the wall-clock duration of each GC cycle.
+//!
- `T_APP` is process CPU time elapsed between consecutive GC cycles (sum of
+//!   CPU time over all threads — mutators, GC workers, compilers, etc.), read
+//!   via `clock_gettime(CLOCK_PROCESS_CPUTIME_ID)`.
+//!
+//! The final heap size is then clamped to the range
+//! `[max(1.1 * used, min_heap_pages), max_heap_pages]`, providing 10% headroom
+//! above current live memory to avoid triggering GC on an effectively-empty
+//! heap.
+//!
+//! ## Differences from the paper
+//!
+//! The paper targets ZGC, a concurrent generational collector. MMTk's Ruby
+//! binding currently ships stop-the-world collectors (Immix, MarkSweep). The
+//! paper's formula still applies: with a STW collector the process CPU time
+//! during GC closely tracks the wall-clock GC time, and mutator CPU time
+//! during the mutator phase is correctly attributed. For generational plans
+//! we skip nursery-only GCs, consistent with MemBalancer.
+
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering;
+use std::sync::Mutex;
+
+use mmtk::util::heap::GCTriggerPolicy;
+use mmtk::util::heap::SpaceStats;
+use mmtk::Plan;
+use mmtk::MMTK;
+use once_cell::sync::OnceCell;
+
+use crate::Ruby;
+
+pub static CPU_HEAP_TRIGGER_CONFIG: OnceCell<CpuHeapTriggerConfig> = OnceCell::new();
+
+/// Configuration for the [`CpuHeapTrigger`].
+pub struct CpuHeapTriggerConfig {
+    /// Lower bound on heap size (in pages). The trigger will never shrink below
+    /// this value.
+    pub min_heap_pages: usize,
+    /// Upper bound on heap size (in pages). The trigger will never grow above
+    /// this value.
+    pub max_heap_pages: usize,
+    /// Initial heap size (in pages).
+    pub initial_heap_pages: usize,
+    /// Target GC CPU overhead as a fraction of total process CPU time. For
+    /// example, `0.15` means the policy will try to keep GC CPU usage near 15%.
+    /// Valid range: `(0.0, 1.0)`.
+    pub target_gc_cpu: f64,
+    /// Number of recent GC cycles averaged together when computing the CPU
+    /// overhead signal.
Smoothes out short-term fluctuations. The paper uses 3.
+    pub window_size: usize,
+}
+
+/// A single GC cycle's timing measurements.
+#[derive(Clone, Copy, Debug, Default)]
+struct GcSample {
+    /// Wall-clock seconds spent inside this GC cycle.
+    gc_seconds: f64,
+    /// Seconds of process CPU time elapsed since the previous GC cycle ended.
+    /// This covers both mutator time and (on multi-threaded mutators) any
+    /// mutator CPU time consumed in parallel with the previous GC.
+    app_cpu_seconds: f64,
+}
+
+struct CpuHeapTriggerState {
+    /// Ring buffer of the last `window_size` samples. Oldest-first.
+    samples: Vec<GcSample>,
+    /// Wall-clock time when the current GC cycle started. `None` when no GC is
+    /// in progress.
+    gc_start_wall: Option<std::time::Instant>,
+    /// Process CPU time (seconds) recorded at the end of the previous GC
+    /// cycle. `None` until the first cycle completes.
+    last_gc_end_cpu: Option<f64>,
+}
+
+impl CpuHeapTriggerState {
+    fn new() -> Self {
+        Self {
+            samples: Vec::new(),
+            gc_start_wall: None,
+            last_gc_end_cpu: None,
+        }
+    }
+
+    /// Pushes a new sample, dropping the oldest when the window is full.
+    fn push_sample(&mut self, sample: GcSample, window_size: usize) {
+        if self.samples.len() >= window_size {
+            self.samples.remove(0);
+        }
+        self.samples.push(sample);
+    }
+
+    /// Returns the arithmetic mean GC CPU overhead across the window, or
+    /// `None` if we don't yet have a full sample (which happens on the first
+    /// GC cycle — we have no baseline for `app_cpu_seconds`).
+    fn mean_gc_cpu(&self) -> Option<f64> {
+        if self.samples.is_empty() {
+            return None;
+        }
+        let total_gc: f64 = self.samples.iter().map(|s| s.gc_seconds).sum();
+        let total_app: f64 = self.samples.iter().map(|s| s.app_cpu_seconds).sum();
+        if total_app <= 0.0 {
+            return None;
+        }
+        Some(total_gc / total_app)
+    }
+}
+
+pub struct CpuHeapTrigger {
+    /// Target heap size in pages. Updated at the end of each GC cycle.
+    target_heap_pages: AtomicUsize,
+    /// Mutable timing state.
Wrapped in a `Mutex` because `on_gc_start` and
+    /// `on_gc_end` are the only mutation sites and they are not on an
+    /// allocation hot path; avoiding the complexity of lock-free state is
+    /// worth the trivial contention.
+    state: Mutex<CpuHeapTriggerState>,
+}
+
+impl Default for CpuHeapTrigger {
+    fn default() -> Self {
+        let cfg = Self::get_config();
+        Self {
+            target_heap_pages: AtomicUsize::new(cfg.initial_heap_pages),
+            state: Mutex::new(CpuHeapTriggerState::new()),
+        }
+    }
+}
+
+impl GCTriggerPolicy<Ruby> for CpuHeapTrigger {
+    fn is_gc_required(
+        &self,
+        space_full: bool,
+        space: Option<SpaceStats<Ruby>>,
+        plan: &dyn Plan<VM = Ruby>,
+    ) -> bool {
+        // Let the plan decide, matching the other triggers.
+        plan.collection_required(space_full, space)
+    }
+
+    fn on_gc_start(&self, _mmtk: &'static MMTK<Ruby>) {
+        let mut state = self.state.lock().unwrap();
+        state.gc_start_wall = Some(std::time::Instant::now());
+    }
+
+    fn on_gc_end(&self, mmtk: &'static MMTK<Ruby>) {
+        // Skip nursery-only GCs for generational plans. The heap resizing
+        // decision is driven by the (much more expensive) full collections
+        // where the signal-to-noise ratio is high enough to be useful.
+        if let Some(gen_plan) = mmtk.get_plan().generational() {
+            if gen_plan.is_current_gc_nursery() {
+                return;
+            }
+        }
+
+        let cfg = Self::get_config();
+        let gc_end_cpu = process_cpu_time_seconds();
+
+        let mut state = self.state.lock().unwrap();
+
+        // Duration of this GC cycle (wall clock).
+        let gc_seconds = state
+            .gc_start_wall
+            .take()
+            .map(|start| start.elapsed().as_secs_f64())
+            .unwrap_or(0.0);
+
+        // Process CPU time elapsed since the previous GC cycle ended. We
+        // require at least one previous end timestamp to produce a valid
+        // sample — without it we cannot compute `T_APP`.
+        if let (Some(last_end), Some(now)) = (state.last_gc_end_cpu, gc_end_cpu) {
+            let app_cpu_seconds = (now - last_end).max(0.0);
+            // Only record non-degenerate samples to avoid poisoning the window
+            // with zero-time entries from back-to-back GCs.
+ if app_cpu_seconds > 0.0 { + state.push_sample( + GcSample { + gc_seconds, + app_cpu_seconds, + }, + cfg.window_size, + ); + } + } + state.last_gc_end_cpu = gc_end_cpu; + + // Compute the new heap size only when we have samples to average over. + if let Some(gc_cpu) = state.mean_gc_cpu() { + // Drop the lock before doing the (relatively cheap) math and + // atomic update; nothing below needs the state. + drop(state); + + let overhead_error = gc_cpu - cfg.target_gc_cpu; // Eq. (2) + let sigmoid_error = sigmoid(overhead_error); // Eq. (3) + let adjustment_factor = sigmoid_error + 0.5; // Eq. (4), range (0.5, 1.5) + + let current = self.target_heap_pages.load(Ordering::Relaxed); + let suggested = ((current as f64) * adjustment_factor) as usize; // Eq. (5) + + // Clamp: + // - upper bound: configured max + // - lower bound: max(1.1 * used, min) — 10% headroom above current + // live memory, so we never request a heap so small that GC is + // triggered immediately on return from this one. + let used = mmtk.get_plan().get_used_pages(); + let floor = ((used as f64) * 1.1).ceil() as usize; + let lower = floor.max(cfg.min_heap_pages).min(cfg.max_heap_pages); + let upper = cfg.max_heap_pages; + let new_target = suggested.clamp(lower, upper); + + self.target_heap_pages.store(new_target, Ordering::Relaxed); + + debug!( + "CpuHeapTrigger: gc_cpu={:.4} target={:.4} factor={:.4} \ + pages {} -> {} (used={}, clamp=[{}, {}])", + gc_cpu, + cfg.target_gc_cpu, + adjustment_factor, + current, + new_target, + used, + lower, + upper + ); + } + } + + fn is_heap_full(&self, plan: &dyn Plan) -> bool { + plan.get_reserved_pages() > self.target_heap_pages.load(Ordering::Relaxed) + } + + fn get_current_heap_size_in_pages(&self) -> usize { + self.target_heap_pages.load(Ordering::Relaxed) + } + + fn get_max_heap_size_in_pages(&self) -> usize { + Self::get_config().max_heap_pages + } + + fn can_heap_size_grow(&self) -> bool { + self.target_heap_pages.load(Ordering::Relaxed) < 
Self::get_config().max_heap_pages
    }
}

impl CpuHeapTrigger {
    /// Fetches the process-global trigger configuration.
    ///
    /// Returns `&'static` rather than a free lifetime `&'b`: the config lives
    /// in a global cell, so the reference genuinely is `'static`, and a
    /// `'static` borrow coerces to any shorter lifetime a caller needs.
    ///
    /// # Panics
    /// Panics if called before `CPU_HEAP_TRIGGER_CONFIG` is initialized.
    fn get_config() -> &'static CpuHeapTriggerConfig {
        CPU_HEAP_TRIGGER_CONFIG
            .get()
            .expect("Attempt to use CPU_HEAP_TRIGGER_CONFIG before it is initialized")
    }
}

/// Standard logistic sigmoid. Returns 0.5 when x == 0, asymptotes to 0 and 1.
fn sigmoid(x: f64) -> f64 {
    1.0 / (1.0 + (-x).exp())
}

/// Reads the process-wide CPU time as a floating-point number of seconds,
/// summed across all threads of this process. Returns `None` if the clock
/// query fails (which should be essentially impossible on supported
/// platforms).
// NOTE(review): `CLOCK_PROCESS_CPUTIME_ID` is POSIX-only; fine for the
// platforms mmtk-ruby targets, but worth confirming no Windows build exists.
fn process_cpu_time_seconds() -> Option<f64> {
    let mut ts = libc::timespec {
        tv_sec: 0,
        tv_nsec: 0,
    };
    // SAFETY: `clock_gettime` writes exactly `sizeof(timespec)` bytes to the
    // pointer we pass, which is a valid local stack allocation.
    let rc = unsafe { libc::clock_gettime(libc::CLOCK_PROCESS_CPUTIME_ID, &mut ts) };
    if rc != 0 {
        return None;
    }
    Some((ts.tv_sec as f64) + (ts.tv_nsec as f64) / 1_000_000_000.0)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sigmoid_is_well_behaved() {
        assert!((sigmoid(0.0) - 0.5).abs() < 1e-12);
        assert!(sigmoid(-100.0) < 1e-9);
        assert!(sigmoid(100.0) > 1.0 - 1e-9);
        // Monotonic.
        assert!(sigmoid(-1.0) < sigmoid(0.0));
        assert!(sigmoid(0.0) < sigmoid(1.0));
    }

    #[test]
    fn adjustment_factor_is_within_paper_bounds() {
        // Eq. (4): adjustment_factor = sigmoid(e) + 0.5 must lie in (0.5, 1.5).
        for e in [-10.0_f64, -1.0, 0.0, 1.0, 10.0] {
            let f = sigmoid(e) + 0.5;
            assert!(f > 0.5 && f < 1.5, "factor {f} out of range for e={e}");
        }
    }

    #[test]
    fn mean_gc_cpu_is_total_weighted() {
        // Two samples with equal app time but different GC time: the mean
        // must be sum(gc)/sum(app), not the average of per-sample ratios.
        let mut state = CpuHeapTriggerState::new();
        state.push_sample(
            GcSample {
                gc_seconds: 1.0,
                app_cpu_seconds: 10.0,
            },
            3,
        );
        state.push_sample(
            GcSample {
                gc_seconds: 3.0,
                app_cpu_seconds: 10.0,
            },
            3,
        );
        // (1 + 3) / (10 + 10) = 0.2
        assert!((state.mean_gc_cpu().unwrap() - 0.2).abs() < 1e-12);
    }

    #[test]
    fn window_drops_oldest() {
        // Overfill a window of 3 with 5 samples; only the newest 3 survive.
        // (gc_seconds doubles as an identity tag here — exact f64 compares
        // are safe for small integers.)
        let mut state = CpuHeapTriggerState::new();
        for i in 0..5 {
            state.push_sample(
                GcSample {
                    gc_seconds: i as f64,
                    app_cpu_seconds: 1.0,
                },
                3,
            );
        }
        assert_eq!(state.samples.len(), 3);
        // After pushing 0,1,2,3,4 with window 3, we should have [2,3,4].
        assert_eq!(state.samples[0].gc_seconds, 2.0);
        assert_eq!(state.samples[2].gc_seconds, 4.0);
    }

    #[test]
    fn no_sample_without_prior_gc() {
        // First GC cycle cannot produce a sample (no previous end time). The
        // push happens only when last_gc_end_cpu is Some.
        // A fresh state therefore has an empty window and no mean.
        let state = CpuHeapTriggerState::new();
        assert!(state.mean_gc_cpu().is_none());
    }
}
diff --git a/gc/mmtk/src/heap/mod.rs b/gc/mmtk/src/heap/mod.rs
index 6af7c1b..05a35ef 100644
--- a/gc/mmtk/src/heap/mod.rs
+++ b/gc/mmtk/src/heap/mod.rs
@@ -1,4 +1,9 @@
+mod cpu_heap_trigger;
 mod ruby_heap_trigger;
+
+pub use cpu_heap_trigger::CpuHeapTrigger;
+pub use cpu_heap_trigger::CpuHeapTriggerConfig;
+pub use cpu_heap_trigger::CPU_HEAP_TRIGGER_CONFIG;
 pub use ruby_heap_trigger::RubyHeapTrigger;
 pub use ruby_heap_trigger::RubyHeapTriggerConfig;
 pub use ruby_heap_trigger::RUBY_HEAP_TRIGGER_CONFIG;
diff --git a/test/mmtk/test_configuration.rb b/test/mmtk/test_configuration.rb
index d44abc4..037383a 100644
--- a/test/mmtk/test_configuration.rb
+++ b/test/mmtk/test_configuration.rb
@@ -22,7 +22,7 @@ def test_MMTK_THREADS
     end
   end
 
-  %w(fixed dynamic ruby).each do |heap|
+  %w(fixed dynamic ruby cpu).each do |heap|
     define_method(:"test_MMTK_HEAP_MODE_#{heap}") do
       assert_separately([{ "MMTK_HEAP_MODE" => heap }], <<~RUBY)
         assert_equal("#{heap}", GC.config[:mmtk_heap_mode])