From 03232456f728d81e009358f8dc39f0f2d5690585 Mon Sep 17 00:00:00 2001 From: kevin Heifner Date: Tue, 19 May 2026 09:05:34 -0500 Subject: [PATCH] rt: add 128 / (<=32-bit divisor) fast path to udivmod128 udivmod128 fell back to a 128-iteration shift/subtract loop whenever the dividend or divisor exceeded 64 bits. Most contract 128-bit divisions have a small divisor (asset math, fixed-point scaling), so add a schoolbook base-2^32 long division: the 128-bit dividend is processed as four 32-bit digits -- 4 native i64.div_u instead of 128 loop iterations. The running remainder satisfies r < v <= 2^32-1 after each step, so (r << 32) | digit < 2^64 (fits a native uint64, no recursion back into __udivti3) and < v * 2^32 (each per-digit quotient < 2^32, so the (hi<<32)|lo packing is exact). Routing is now 64/64 -> 128/(<=32-bit) -> 128-iteration loop. Verified bit-identical to the slow loop over 16M random + boundary inputs, so this is a pure optimization with no determinism change. Adds udivti3/umodti3_small_divisor_fastpath covering the digit-carry chain, the 0xFFFFFFFF boundary, and the v==2^32 slow-loop fall-through. --- libraries/rt/compiler_builtins.cpp | 24 +++++++++++ tests/unit/compiler_builtins_tests.cpp | 55 ++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/libraries/rt/compiler_builtins.cpp b/libraries/rt/compiler_builtins.cpp index eca7c1fee..8eb83fcb0 100644 --- a/libraries/rt/compiler_builtins.cpp +++ b/libraries/rt/compiler_builtins.cpp @@ -64,6 +64,30 @@ namespace { *r_hi = 0; return; } + // Fast path: 128-bit / (<= 32-bit divisor). Schoolbook long division + // with the dividend split into four 32-bit digits, most significant + // first. The running remainder r satisfies r < v <= 2^32-1 after every + // `r %= v`, so the next dividend (r << 32) | digit < 2^64 -- it fits a + // native uint64 and the i64.div_u below is exact (no recursion back into + // __udivti3). 
The same bound gives (r << 32) | digit < v * 2^32, so each + // per-digit quotient is < 2^32 and the (hi << 32) | lo packing is exact. + // Since `digit < 2^32`, every `(r << 32) | digit` is a disjoint-bit add. + if (v_hi == 0 && v_lo <= 0xFFFFFFFFULL) { + const uint64_t v = v_lo; + uint64_t r = u_hi >> 32; // digit 3 (top) + const uint64_t q3 = r / v; r %= v; + r = (r << 32) | (u_hi & 0xFFFFFFFFULL); // digit 2 + const uint64_t q2 = r / v; r %= v; + r = (r << 32) | (u_lo >> 32); // digit 1 + const uint64_t q1 = r / v; r %= v; + r = (r << 32) | (u_lo & 0xFFFFFFFFULL); // digit 0 (bottom) + const uint64_t q0 = r / v; r %= v; + *q_hi = (q3 << 32) | q2; + *q_lo = (q1 << 32) | q0; + *r_lo = r; // r < v <= 2^32-1 + *r_hi = 0; + return; + } uint64_t ql = 0, qh = 0; uint64_t rl = 0, rh = 0; for (int i = 127; i >= 0; --i) { diff --git a/tests/unit/compiler_builtins_tests.cpp b/tests/unit/compiler_builtins_tests.cpp index 0e0c47ade..cf2ae16de 100644 --- a/tests/unit/compiler_builtins_tests.cpp +++ b/tests/unit/compiler_builtins_tests.cpp @@ -50,6 +50,15 @@ namespace { inline uint64_t hi (s128 v) { return static_cast<uint64_t>(static_cast<u128>(v) >> 64); } inline uint64_t ulo(u128 v) { return static_cast<uint64_t>(v); } inline uint64_t uhi(u128 v) { return static_cast<uint64_t>(v >> 64); } + + // Build a u128 from (lo, hi) using only a constant-distance shift and OR -- + // no u128 `/ % *`, so this does not lower to a __udivti3/__multi3 call that + // would collide with libnative_rt's custom signatures (see note above + // udivti3_basic). Expected values for the div/mod tests are precomputed + // literals fed through this. 
+ inline u128 mk(uint64_t lo_, uint64_t hi_) { + return (static_cast<u128>(hi_) << 64) | static_cast<u128>(lo_); + } } // ===================================================================== @@ -184,6 +193,50 @@ SYSIO_TEST_BEGIN(umodti3_basic) CHECK_EQUAL(r, static_cast<u128>(0)) SYSIO_TEST_END +// ===================================================================== +// udivmod128 small-divisor fast path: 128-bit dividend / (<= 32-bit +// divisor). Exercises the four-32-bit-digit long division, its +// 0xFFFFFFFF upper boundary, and the just-over-boundary divisor +// (v == 2^32) that must fall through to the 128-iteration slow loop. +// Vectors precomputed in Python; full-128 dividends so the 64/64 fast +// path is skipped and the new path (or the slow loop) is taken. +// ===================================================================== +SYSIO_TEST_BEGIN(udivti3_small_divisor_fastpath) + u128 r = 0; + // UINT128_MAX / 7 -- non-zero remainder carried across all four digits. + __udivti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 7, 0); + CHECK_EQUAL(r, mk(0x4924924924924924ULL, 0x2492492492492492ULL)) + // UINT128_MAX / 0xFFFFFFFF -- largest divisor still on the fast path. + __udivti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFULL, 0); + CHECK_EQUAL(r, mk(0x0000000100000001ULL, 0x0000000100000001ULL)) + // UINT128_MAX / 1 -- identity: q == u, r == 0 on the fast path. + __udivti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 1, 0); + CHECK_EQUAL(r, mk(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL)) + // Mixed dividend / 0xDEADBEEF (divisor just under 2^32). + __udivti3(r, 0xFEDCBA9876543210ULL, 0x0123456789ABCDEFULL, 0xDEADBEEFULL, 0); + CHECK_EQUAL(r, mk(0x717DCE0520562DA1ULL, 0x00000000014EDB42ULL)) + // v == 2^32: one past the fast-path guard -> 128-iteration slow loop. 
+ __udivti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0x100000000ULL, 0); + CHECK_EQUAL(r, mk(0xFFFFFFFFFFFFFFFFULL, 0x00000000FFFFFFFFULL)) +SYSIO_TEST_END + +SYSIO_TEST_BEGIN(umodti3_small_divisor_fastpath) + u128 r = 0; + // Remainders for the same vectors; result is always < v <= 2^32-1. + __umodti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 7, 0); + CHECK_EQUAL(r, static_cast<u128>(3)) + __umodti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFULL, 0); + CHECK_EQUAL(r, static_cast<u128>(0)) + __umodti3(r, 0xFEDCBA9876543210ULL, 0x0123456789ABCDEFULL, 0xDEADBEEFULL, 0); + CHECK_EQUAL(r, static_cast<u128>(0xDC351AC1ULL)) + // 0xFFFF...FFFF_0000...0001 / 3 -> remainder 1 (carry chain ends nonzero). + __umodti3(r, 0x0000000000000001ULL, 0xFFFFFFFFFFFFFFFFULL, 3, 0); + CHECK_EQUAL(r, static_cast<u128>(1)) + // v == 2^32: slow-loop remainder still exact. + __umodti3(r, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0x100000000ULL, 0); + CHECK_EQUAL(r, static_cast<u128>(0xFFFFFFFFULL)) +SYSIO_TEST_END + // ===================================================================== // __ashlti3 / __lshlti3 -- left shift (same bit pattern) // ===================================================================== @@ -352,6 +405,8 @@ int main(int argc, char* argv[]) { SYSIO_TEST(modti3_basic); SYSIO_TEST(modti3_int128_min_wrap); SYSIO_TEST(umodti3_basic); + SYSIO_TEST(udivti3_small_divisor_fastpath); + SYSIO_TEST(umodti3_small_divisor_fastpath); SYSIO_TEST(shift_left_basic); SYSIO_TEST(shift_left_oob_saturates_to_zero); SYSIO_TEST(lshrti3_basic);