From 038b42e1d6902748e522f8420a07accc0fed9440 Mon Sep 17 00:00:00 2001 From: Mo Chen Date: Mon, 1 Jun 2026 13:42:34 -0500 Subject: [PATCH] Fix the memory-pressure throttle and its RSS metric The proxy.config.memory.max_usage throttle and the proxy.process.traffic_server.memory.rss gauge are both driven by the MemoryLimit continuation, and both were broken. The throttle was silently dead. MemoryLimit sets the global net_memory_throttle flag, but the only reader in the accept path was removed in 2018 (4bfaf3645), so for years the flag has been written and never consulted. Restore it by honoring net_memory_throttle in check_net_throttle(ACCEPT); gating on ACCEPT keeps outbound CONNECTs flowing so in-flight transactions can complete and release memory. The flag is now std::atomic (relaxed) since it is written on ET_TASK and read on the ET_NET accept path. The RSS reading was wrong. MemoryLimit stored getrusage().ru_maxrss << 10, which is PEAK RSS, not current -- wrong for a gauge named .rss, and it made the throttle latch on once peak crossed the limit and never release. ru_maxrss is also KiB on Linux but bytes on macOS/BSD, so the shift was wrong off-Linux. Read current RSS portably via a new ink_get_current_rss() helper (Linux /proc/self/statm, macOS task_info, FreeBSD sysctl KERN_PROC_PID) and compare current (not peak) RSS in bytes against the limit, so the throttle engages and releases correctly. The metric is tied to the feature. MemoryLimit is now scheduled only when max_usage > 0, and it samples and publishes RSS only while enabled, so the cost of reading RSS and the memory.rss gauge are incurred only when the memory-limit feature is turned on. max_usage requires a restart to change, so this gate is stable for the life of the process. Co-Authored-By: Claude Opus 4.8 --- .../monitoring/statistics/core/general.en.rst | 11 ++- include/tscore/ink_sys_control.h | 13 ++++ src/iocore/net/P_UnixNet.h | 24 +++++-- src/iocore/net/UnixNet.cc | 12 ++-- src/traffic_server/traffic_server.cc | 66 +++++++++-------- src/tscore/CMakeLists.txt | 1 + src/tscore/ink_sys_control.cc | 67 +++++++++++++++++ src/tscore/unit_tests/test_ink_sys_control.cc | 71 +++++++++++++++++++ 8 files changed, 221 insertions(+), 44 deletions(-) create mode 100644 src/tscore/unit_tests/test_ink_sys_control.cc diff --git a/doc/admin-guide/monitoring/statistics/core/general.en.rst b/doc/admin-guide/monitoring/statistics/core/general.en.rst index 23247760cc5..6479082310d 100644 --- a/doc/admin-guide/monitoring/statistics/core/general.en.rst +++ b/doc/admin-guide/monitoring/statistics/core/general.en.rst @@ -81,5 +81,12 @@ General .. ts:stat:: global proxy.process.traffic_server.memory.rss integer :units: bytes - The resident set size (RSS) of the ``traffic_server`` process. This is - basically the amount of memory this process is consuming. + The current resident set size (RSS) of the ``traffic_server`` process. This + is basically the amount of memory this process is consuming. The value is + refreshed every 10 seconds and reflects current (not peak) usage, so it can + both rise and fall over the lifetime of the process. + + This gauge is only published when the memory-limit feature is enabled (that + is, when :ts:cv:`proxy.config.memory.max_usage` is greater than 0); when the + feature is disabled the process does not sample RSS and the metric is not + reported. diff --git a/include/tscore/ink_sys_control.h b/include/tscore/ink_sys_control.h index 81e24f35ebc..c770376c324 100644 --- a/include/tscore/ink_sys_control.h +++ b/include/tscore/ink_sys_control.h @@ -24,8 +24,21 @@ #pragma once #include +#include rlim_t ink_get_fds_limit(); void ink_set_fds_limit(rlim_t); rlim_t ink_max_out_rlimit(int which); rlim_t ink_get_max_files(); + +/** Get the current resident set size (RSS) of this process, in bytes. + * + * Unlike getrusage(2)'s ru_maxrss, this reports the *current* RSS rather than + * the peak, so it is suitable for a live gauge and can both rise and fall. + * + * Implemented via /proc/self/statm on Linux, task_info() on macOS, and + * sysctl(KERN_PROC_PID) on FreeBSD. + * + * @return current RSS in bytes, or 0 if it could not be determined. + */ +uint64_t ink_get_current_rss(); diff --git a/src/iocore/net/P_UnixNet.h b/src/iocore/net/P_UnixNet.h index 865f5b6c518..4a50e1a5d45 100644 --- a/src/iocore/net/P_UnixNet.h +++ b/src/iocore/net/P_UnixNet.h @@ -23,6 +23,8 @@ #pragma once +#include + #include "P_Net.h" #include "P_UnixNetVConnection.h" #include "P_UnixPollDescriptor.h" @@ -42,13 +44,13 @@ PollDescriptor *get_PollDescriptor(EThread *t); using NetContHandler = int (NetHandler::*)(int, void *); using uint32 = unsigned int; -extern ink_hrtime last_throttle_warning; -extern ink_hrtime last_shedding_warning; -extern ink_hrtime emergency_throttle_time; -extern int net_connections_throttle; -extern bool net_memory_throttle; -extern int fds_throttle; -extern ink_hrtime last_transient_accept_error; +extern ink_hrtime last_throttle_warning; +extern ink_hrtime last_shedding_warning; +extern ink_hrtime emergency_throttle_time; +extern int net_connections_throttle; +extern std::atomic net_memory_throttle; +extern int fds_throttle; +extern ink_hrtime last_transient_accept_error; // // Configuration Parameter had to move here to share @@ -105,6 +107,14 @@ check_shedding_warning() TS_INLINE bool check_net_throttle(ThrottleType t) { + // Throttle new inbound connections when resident memory exceeds + // proxy.config.memory.max_usage (set by the MemoryLimit continuation). Only + // ACCEPT is throttled; outbound CONNECTs may be needed to let in-flight + // transactions complete and release memory. + if (t == ACCEPT && net_memory_throttle.load(std::memory_order_relaxed)) { + return true; + } + int connections = net_connections_to_throttle(t); if (net_connections_throttle != 0 && connections >= net_connections_throttle) { diff --git a/src/iocore/net/UnixNet.cc b/src/iocore/net/UnixNet.cc index d26322a5a80..ac37fb9567f 100644 --- a/src/iocore/net/UnixNet.cc +++ b/src/iocore/net/UnixNet.cc @@ -31,12 +31,12 @@ #include "iocore/io_uring/IO_URING.h" #endif -ink_hrtime last_throttle_warning; -ink_hrtime last_shedding_warning; -int net_connections_throttle; -bool net_memory_throttle = false; -int fds_throttle; -ink_hrtime last_transient_accept_error; +ink_hrtime last_throttle_warning; +ink_hrtime last_shedding_warning; +int net_connections_throttle; +std::atomic net_memory_throttle = false; +int fds_throttle; +ink_hrtime last_transient_accept_error; NetHandler::Config NetHandler::global_config; std::bitset::digits> NetHandler::active_thread_types; diff --git a/src/traffic_server/traffic_server.cc b/src/traffic_server/traffic_server.cc index a9ea60fcab1..65a2ed139ec 100644 --- a/src/traffic_server/traffic_server.cc +++ b/src/traffic_server/traffic_server.cc @@ -472,7 +472,6 @@ class MemoryLimit : public Continuation public: MemoryLimit() : Continuation(new_ProxyMutex()) { - memset(&_usage, 0, sizeof(_usage)); SET_HANDLER(&MemoryLimit::periodic); memory_rss = Metrics::Gauge::createPtr("proxy.process.traffic_server.memory.rss"); } @@ -489,41 +488,44 @@ class MemoryLimit : public Continuation return EVENT_DONE; } - // "reload" the setting, we don't do this often so not expensive + // "reload" the setting, we don't do this often so not expensive. + // proxy.config.memory.max_usage is in bytes; 0 (the default) disables the + // feature. We only schedule this continuation when it is enabled, but + // re-check here defensively and stop monitoring if it is ever cleared. _memory_limit = RecGetRecordInt("proxy.config.memory.max_usage").value_or(0); - _memory_limit = _memory_limit >> 10; // divide by 1024 - - if (getrusage(RUSAGE_SELF, &_usage) == 0) { - ts::Metrics::Gauge::store(memory_rss, _usage.ru_maxrss << 10); // * 1024 - Dbg(dbg_ctl_server, "memory usage - ru_maxrss: %ld memory limit: %" PRId64, _usage.ru_maxrss, _memory_limit); - if (_memory_limit > 0) { - if (_usage.ru_maxrss > _memory_limit) { - if (net_memory_throttle == false) { - net_memory_throttle = true; - Dbg(dbg_ctl_server, "memory usage exceeded limit - ru_maxrss: %ld memory limit: %" PRId64, _usage.ru_maxrss, - _memory_limit); - } - } else { - if (net_memory_throttle == true) { - net_memory_throttle = false; - Dbg(dbg_ctl_server, "memory usage under limit - ru_maxrss: %ld memory limit: %" PRId64, _usage.ru_maxrss, - _memory_limit); - } - } - } else { - // this feature has not been enabled - Dbg(dbg_ctl_server, "limiting connections based on memory usage has been disabled"); - e->cancel(); - delete this; - return EVENT_DONE; + if (_memory_limit <= 0) { + Dbg(dbg_ctl_server, "limiting connections based on memory usage has been disabled"); + e->cancel(); + delete this; + return EVENT_DONE; + } + + // Sample and publish the *current* RSS only while the feature is enabled, + // so we incur the cost of reading RSS only when it is needed. + // ink_get_current_rss() reports current (not peak) RSS in bytes, portably, + // so the gauge can both rise and fall and the throttle releases correctly. + uint64_t rss = ink_get_current_rss(); + ts::Metrics::Gauge::store(memory_rss, static_cast(rss)); + Dbg(dbg_ctl_server, "memory usage - current rss: %" PRIu64 " bytes memory limit: %" PRId64 " bytes", rss, _memory_limit); + + // net_memory_throttle is read on accept threads, so use relaxed atomics. + if (rss > static_cast(_memory_limit)) { + if (net_memory_throttle.load(std::memory_order_relaxed) == false) { + net_memory_throttle.store(true, std::memory_order_relaxed); + Dbg(dbg_ctl_server, "memory usage exceeded limit - current rss: %" PRIu64 " memory limit: %" PRId64, rss, _memory_limit); + } + } else { + if (net_memory_throttle.load(std::memory_order_relaxed) == true) { + net_memory_throttle.store(false, std::memory_order_relaxed); + Dbg(dbg_ctl_server, "memory usage under limit - current rss: %" PRIu64 " memory limit: %" PRId64, rss, _memory_limit); } } + return EVENT_CONT; } private: int64_t _memory_limit = 0; - struct rusage _usage; Metrics::Gauge::AtomicType *memory_rss; }; @@ -2273,7 +2275,13 @@ main(int /* argc ATS_UNUSED */, const char **argv) eventProcessor.schedule_every(new SignalContinuation, HRTIME_MSECOND * 500, ET_CALL); eventProcessor.schedule_every(new DiagsLogContinuation, HRTIME_SECOND, ET_TASK); - eventProcessor.schedule_every(new MemoryLimit, HRTIME_SECOND * 10, ET_TASK); + // Only monitor RSS and enforce the memory limit when the feature is enabled. + // proxy.config.memory.max_usage requires a restart to change, so this gate is + // stable for the life of the process. This also avoids reading RSS and + // publishing the memory.rss gauge when the feature is off. + if (RecGetRecordInt("proxy.config.memory.max_usage").value_or(0) > 0) { + eventProcessor.schedule_every(new MemoryLimit, HRTIME_SECOND * 10, ET_TASK); + } RecRegisterConfigUpdateCb("proxy.config.dump_mem_info_frequency", init_memory_tracker, nullptr); init_memory_tracker(nullptr, RECD_NULL, RecData(), nullptr); diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt index 7790adc87dd..ed41d5ed60e 100644 --- a/src/tscore/CMakeLists.txt +++ b/src/tscore/CMakeLists.txt @@ -161,6 +161,7 @@ if(BUILD_TESTING) unit_tests/test_ink_inet.cc unit_tests/test_ink_memory.cc unit_tests/test_ink_string.cc + unit_tests/test_ink_sys_control.cc unit_tests/test_layout.cc unit_tests/test_scoped_resource.cc unit_tests/test_Version.cc diff --git a/src/tscore/ink_sys_control.cc b/src/tscore/ink_sys_control.cc index 79898c9ce6d..8a76b0591dc 100644 --- a/src/tscore/ink_sys_control.cc +++ b/src/tscore/ink_sys_control.cc @@ -22,12 +22,24 @@ */ #include +#include +#include #include "tscore/ink_defs.h" #include "tscore/ink_assert.h" #include "tscore/ink_sys_control.h" #include "tscore/Diags.h" +#if defined(darwin) +#include +#endif + +#if defined(freebsd) +#include +#include +#include +#endif + namespace { rlim_t global_fds_limit = 8000; @@ -98,3 +110,58 @@ ink_get_max_files() return RLIM_INFINITY; } + +uint64_t +ink_get_current_rss() +{ +#if defined(__linux__) + // /proc/self/statm reports sizes in pages. The second field is the resident + // set size (number of resident pages). + FILE *fp = fopen("/proc/self/statm", "r"); + if (fp == nullptr) { + return 0; + } + + unsigned long total_pages = 0; + unsigned long resident_pages = 0; + int matched = fscanf(fp, "%lu %lu", &total_pages, &resident_pages); + fclose(fp); + + if (matched != 2) { + return 0; + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return 0; + } + + return static_cast(resident_pages) * static_cast(page_size); +#elif defined(darwin) + // On macOS, task_info() reports resident_size in bytes. + mach_task_basic_info_data_t info; + mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, reinterpret_cast(&info), &count) != KERN_SUCCESS) { + return 0; + } + + return static_cast(info.resident_size); +#elif defined(freebsd) + // On FreeBSD, kinfo_proc.ki_rssize is the resident set size in pages. + struct kinfo_proc kp; + size_t len = sizeof(kp); + int mib[] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid()}; + if (sysctl(mib, sizeof(mib) / sizeof(mib[0]), &kp, &len, nullptr, 0) != 0 || len != sizeof(kp)) { + return 0; + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return 0; + } + + return static_cast(kp.ki_rssize) * static_cast(page_size); +#else + return 0; +#endif +} diff --git a/src/tscore/unit_tests/test_ink_sys_control.cc b/src/tscore/unit_tests/test_ink_sys_control.cc new file mode 100644 index 00000000000..67524c22550 --- /dev/null +++ b/src/tscore/unit_tests/test_ink_sys_control.cc @@ -0,0 +1,71 @@ +/** @file + + test ink_sys_control.h - system resource helpers + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +#if defined(__linux__) || defined(darwin) || defined(freebsd) +#include +#include +#include +#endif + +TEST_CASE("ink_get_current_rss reports a plausible current RSS", "[ink_sys_control]") +{ +#if defined(__linux__) || defined(darwin) || defined(freebsd) + uint64_t rss = ink_get_current_rss(); + + // The test process is obviously resident, so current RSS must be non-zero + // and well above a trivially small floor (at least one page). + REQUIRE(rss > 0); + REQUIRE(rss >= 4096); + + SECTION("RSS grows when we touch a large fresh mapping") + { + uint64_t before = ink_get_current_rss(); + + // Map ~64 MiB of fresh anonymous memory. Unlike a heap allocation, these + // pages are guaranteed not to have been resident before, so touching every + // page deterministically increases RSS. + constexpr size_t bytes = 64 * 1024 * 1024; + void *mapping = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + REQUIRE(mapping != MAP_FAILED); + + long page_size = sysconf(_SC_PAGESIZE); + REQUIRE(page_size > 0); + + volatile char *buf = static_cast(mapping); + for (size_t i = 0; i < bytes; i += static_cast(page_size)) { + buf[i] = static_cast(i); + } + + uint64_t after = ink_get_current_rss(); + REQUIRE(after > before); + + munmap(mapping, bytes); + } +#else + // On unsupported platforms the helper is documented to return 0. + REQUIRE(ink_get_current_rss() == 0); +#endif +}