diff --git a/doc/admin-guide/monitoring/statistics/core/general.en.rst b/doc/admin-guide/monitoring/statistics/core/general.en.rst index 23247760cc5..6479082310d 100644 --- a/doc/admin-guide/monitoring/statistics/core/general.en.rst +++ b/doc/admin-guide/monitoring/statistics/core/general.en.rst @@ -81,5 +81,12 @@ General .. ts:stat:: global proxy.process.traffic_server.memory.rss integer :units: bytes - The resident set size (RSS) of the ``traffic_server`` process. This is - basically the amount of memory this process is consuming. + The current resident set size (RSS) of the ``traffic_server`` process. This + is basically the amount of memory this process is consuming. The value is + refreshed every 10 seconds and reflects current (not peak) usage, so it can + both rise and fall over the lifetime of the process. + + This gauge is only published when the memory-limit feature is enabled (that + is, when :ts:cv:`proxy.config.memory.max_usage` is greater than 0); when the + feature is disabled the process does not sample RSS and the metric is not + reported. diff --git a/include/tscore/ink_sys_control.h b/include/tscore/ink_sys_control.h index 81e24f35ebc..c770376c324 100644 --- a/include/tscore/ink_sys_control.h +++ b/include/tscore/ink_sys_control.h @@ -24,8 +24,21 @@ #pragma once #include +#include rlim_t ink_get_fds_limit(); void ink_set_fds_limit(rlim_t); rlim_t ink_max_out_rlimit(int which); rlim_t ink_get_max_files(); + +/** Get the current resident set size (RSS) of this process, in bytes. + * + * Unlike getrusage(2)'s ru_maxrss, this reports the *current* RSS rather than + * the peak, so it is suitable for a live gauge and can both rise and fall. + * + * Implemented via /proc/self/statm on Linux, task_info() on macOS, and + * sysctl(KERN_PROC_PID) on FreeBSD. + * + * @return current RSS in bytes, or 0 if it could not be determined. + */ +uint64_t ink_get_current_rss(); diff --git a/src/iocore/net/P_UnixNet.h b/src/iocore/net/P_UnixNet.h index 865f5b6c518..4a50e1a5d45 100644 --- a/src/iocore/net/P_UnixNet.h +++ b/src/iocore/net/P_UnixNet.h @@ -23,6 +23,8 @@ #pragma once +#include + #include "P_Net.h" #include "P_UnixNetVConnection.h" #include "P_UnixPollDescriptor.h" @@ -42,13 +44,13 @@ PollDescriptor *get_PollDescriptor(EThread *t); using NetContHandler = int (NetHandler::*)(int, void *); using uint32 = unsigned int; -extern ink_hrtime last_throttle_warning; -extern ink_hrtime last_shedding_warning; -extern ink_hrtime emergency_throttle_time; -extern int net_connections_throttle; -extern bool net_memory_throttle; -extern int fds_throttle; -extern ink_hrtime last_transient_accept_error; +extern ink_hrtime last_throttle_warning; +extern ink_hrtime last_shedding_warning; +extern ink_hrtime emergency_throttle_time; +extern int net_connections_throttle; +extern std::atomic net_memory_throttle; +extern int fds_throttle; +extern ink_hrtime last_transient_accept_error; // // Configuration Parameter had to move here to share @@ -105,6 +107,14 @@ check_shedding_warning() TS_INLINE bool check_net_throttle(ThrottleType t) { + // Throttle new inbound connections when resident memory exceeds + // proxy.config.memory.max_usage (set by the MemoryLimit continuation). Only + // ACCEPT is throttled; outbound CONNECTs may be needed to let in-flight + // transactions complete and release memory. + if (t == ACCEPT && net_memory_throttle.load(std::memory_order_relaxed)) { + return true; + } + int connections = net_connections_to_throttle(t); if (net_connections_throttle != 0 && connections >= net_connections_throttle) { diff --git a/src/iocore/net/UnixNet.cc b/src/iocore/net/UnixNet.cc index d26322a5a80..ac37fb9567f 100644 --- a/src/iocore/net/UnixNet.cc +++ b/src/iocore/net/UnixNet.cc @@ -31,12 +31,12 @@ #include "iocore/io_uring/IO_URING.h" #endif -ink_hrtime last_throttle_warning; -ink_hrtime last_shedding_warning; -int net_connections_throttle; -bool net_memory_throttle = false; -int fds_throttle; -ink_hrtime last_transient_accept_error; +ink_hrtime last_throttle_warning; +ink_hrtime last_shedding_warning; +int net_connections_throttle; +std::atomic net_memory_throttle = false; +int fds_throttle; +ink_hrtime last_transient_accept_error; NetHandler::Config NetHandler::global_config; std::bitset::digits> NetHandler::active_thread_types; diff --git a/src/traffic_server/traffic_server.cc b/src/traffic_server/traffic_server.cc index a9ea60fcab1..65a2ed139ec 100644 --- a/src/traffic_server/traffic_server.cc +++ b/src/traffic_server/traffic_server.cc @@ -472,7 +472,6 @@ class MemoryLimit : public Continuation public: MemoryLimit() : Continuation(new_ProxyMutex()) { - memset(&_usage, 0, sizeof(_usage)); SET_HANDLER(&MemoryLimit::periodic); memory_rss = Metrics::Gauge::createPtr("proxy.process.traffic_server.memory.rss"); } @@ -489,41 +488,44 @@ class MemoryLimit : public Continuation return EVENT_DONE; } - // "reload" the setting, we don't do this often so not expensive + // "reload" the setting, we don't do this often so not expensive. + // proxy.config.memory.max_usage is in bytes; 0 (the default) disables the + // feature. We only schedule this continuation when it is enabled, but + // re-check here defensively and stop monitoring if it is ever cleared. _memory_limit = RecGetRecordInt("proxy.config.memory.max_usage").value_or(0); - _memory_limit = _memory_limit >> 10; // divide by 1024 - - if (getrusage(RUSAGE_SELF, &_usage) == 0) { - ts::Metrics::Gauge::store(memory_rss, _usage.ru_maxrss << 10); // * 1024 - Dbg(dbg_ctl_server, "memory usage - ru_maxrss: %ld memory limit: %" PRId64, _usage.ru_maxrss, _memory_limit); - if (_memory_limit > 0) { - if (_usage.ru_maxrss > _memory_limit) { - if (net_memory_throttle == false) { - net_memory_throttle = true; - Dbg(dbg_ctl_server, "memory usage exceeded limit - ru_maxrss: %ld memory limit: %" PRId64, _usage.ru_maxrss, - _memory_limit); - } - } else { - if (net_memory_throttle == true) { - net_memory_throttle = false; - Dbg(dbg_ctl_server, "memory usage under limit - ru_maxrss: %ld memory limit: %" PRId64, _usage.ru_maxrss, - _memory_limit); - } - } - } else { - // this feature has not been enabled - Dbg(dbg_ctl_server, "limiting connections based on memory usage has been disabled"); - e->cancel(); - delete this; - return EVENT_DONE; + if (_memory_limit <= 0) { + Dbg(dbg_ctl_server, "limiting connections based on memory usage has been disabled"); + e->cancel(); + delete this; + return EVENT_DONE; + } + + // Sample and publish the *current* RSS only while the feature is enabled, + // so we incur the cost of reading RSS only when it is needed. + // ink_get_current_rss() reports current (not peak) RSS in bytes, portably, + // so the gauge can both rise and fall and the throttle releases correctly. + uint64_t rss = ink_get_current_rss(); + ts::Metrics::Gauge::store(memory_rss, static_cast(rss)); + Dbg(dbg_ctl_server, "memory usage - current rss: %" PRIu64 " bytes memory limit: %" PRId64 " bytes", rss, _memory_limit); + + // net_memory_throttle is read on accept threads, so use relaxed atomics. + if (rss > static_cast(_memory_limit)) { + if (net_memory_throttle.load(std::memory_order_relaxed) == false) { + net_memory_throttle.store(true, std::memory_order_relaxed); + Dbg(dbg_ctl_server, "memory usage exceeded limit - current rss: %" PRIu64 " memory limit: %" PRId64, rss, _memory_limit); + } + } else { + if (net_memory_throttle.load(std::memory_order_relaxed) == true) { + net_memory_throttle.store(false, std::memory_order_relaxed); + Dbg(dbg_ctl_server, "memory usage under limit - current rss: %" PRIu64 " memory limit: %" PRId64, rss, _memory_limit); } } + return EVENT_CONT; } private: int64_t _memory_limit = 0; - struct rusage _usage; Metrics::Gauge::AtomicType *memory_rss; }; @@ -2273,7 +2275,13 @@ main(int /* argc ATS_UNUSED */, const char **argv) eventProcessor.schedule_every(new SignalContinuation, HRTIME_MSECOND * 500, ET_CALL); eventProcessor.schedule_every(new DiagsLogContinuation, HRTIME_SECOND, ET_TASK); - eventProcessor.schedule_every(new MemoryLimit, HRTIME_SECOND * 10, ET_TASK); + // Only monitor RSS and enforce the memory limit when the feature is enabled. + // proxy.config.memory.max_usage requires a restart to change, so this gate is + // stable for the life of the process. This also avoids reading RSS and + // publishing the memory.rss gauge when the feature is off. + if (RecGetRecordInt("proxy.config.memory.max_usage").value_or(0) > 0) { + eventProcessor.schedule_every(new MemoryLimit, HRTIME_SECOND * 10, ET_TASK); + } RecRegisterConfigUpdateCb("proxy.config.dump_mem_info_frequency", init_memory_tracker, nullptr); init_memory_tracker(nullptr, RECD_NULL, RecData(), nullptr); diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt index 7790adc87dd..ed41d5ed60e 100644 --- a/src/tscore/CMakeLists.txt +++ b/src/tscore/CMakeLists.txt @@ -161,6 +161,7 @@ if(BUILD_TESTING) unit_tests/test_ink_inet.cc unit_tests/test_ink_memory.cc unit_tests/test_ink_string.cc + unit_tests/test_ink_sys_control.cc unit_tests/test_layout.cc unit_tests/test_scoped_resource.cc unit_tests/test_Version.cc diff --git a/src/tscore/ink_sys_control.cc b/src/tscore/ink_sys_control.cc index 79898c9ce6d..8a76b0591dc 100644 --- a/src/tscore/ink_sys_control.cc +++ b/src/tscore/ink_sys_control.cc @@ -22,12 +22,24 @@ */ #include +#include +#include #include "tscore/ink_defs.h" #include "tscore/ink_assert.h" #include "tscore/ink_sys_control.h" #include "tscore/Diags.h" +#if defined(darwin) +#include +#endif + +#if defined(freebsd) +#include +#include +#include +#endif + namespace { rlim_t global_fds_limit = 8000; @@ -98,3 +110,58 @@ ink_get_max_files() return RLIM_INFINITY; } + +uint64_t +ink_get_current_rss() +{ +#if defined(__linux__) + // /proc/self/statm reports sizes in pages. The second field is the resident + // set size (number of resident pages). + FILE *fp = fopen("/proc/self/statm", "r"); + if (fp == nullptr) { + return 0; + } + + unsigned long total_pages = 0; + unsigned long resident_pages = 0; + int matched = fscanf(fp, "%lu %lu", &total_pages, &resident_pages); + fclose(fp); + + if (matched != 2) { + return 0; + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return 0; + } + + return static_cast(resident_pages) * static_cast(page_size); +#elif defined(darwin) + // On macOS, task_info() reports resident_size in bytes. + mach_task_basic_info_data_t info; + mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, reinterpret_cast(&info), &count) != KERN_SUCCESS) { + return 0; + } + + return static_cast(info.resident_size); +#elif defined(freebsd) + // On FreeBSD, kinfo_proc.ki_rssize is the resident set size in pages. + struct kinfo_proc kp; + size_t len = sizeof(kp); + int mib[] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid()}; + if (sysctl(mib, sizeof(mib) / sizeof(mib[0]), &kp, &len, nullptr, 0) != 0 || len != sizeof(kp)) { + return 0; + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return 0; + } + + return static_cast(kp.ki_rssize) * static_cast(page_size); +#else + return 0; +#endif +} diff --git a/src/tscore/unit_tests/test_ink_sys_control.cc b/src/tscore/unit_tests/test_ink_sys_control.cc new file mode 100644 index 00000000000..67524c22550 --- /dev/null +++ b/src/tscore/unit_tests/test_ink_sys_control.cc @@ -0,0 +1,71 @@ +/** @file + + test ink_sys_control.h - system resource helpers + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +#if defined(__linux__) || defined(darwin) || defined(freebsd) +#include +#include +#include +#endif + +TEST_CASE("ink_get_current_rss reports a plausible current RSS", "[ink_sys_control]") +{ +#if defined(__linux__) || defined(darwin) || defined(freebsd) + uint64_t rss = ink_get_current_rss(); + + // The test process is obviously resident, so current RSS must be non-zero + // and well above a trivially small floor (at least one page). + REQUIRE(rss > 0); + REQUIRE(rss >= 4096); + + SECTION("RSS grows when we touch a large fresh mapping") + { + uint64_t before = ink_get_current_rss(); + + // Map ~64 MiB of fresh anonymous memory. Unlike a heap allocation, these + // pages are guaranteed not to have been resident before, so touching every + // page deterministically increases RSS. + constexpr size_t bytes = 64 * 1024 * 1024; + void *mapping = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + REQUIRE(mapping != MAP_FAILED); + + long page_size = sysconf(_SC_PAGESIZE); + REQUIRE(page_size > 0); + + volatile char *buf = static_cast(mapping); + for (size_t i = 0; i < bytes; i += static_cast(page_size)) { + buf[i] = static_cast(i); + } + + uint64_t after = ink_get_current_rss(); + REQUIRE(after > before); + + munmap(mapping, bytes); + } +#else + // On unsupported platforms the helper is documented to return 0. + REQUIRE(ink_get_current_rss() == 0); +#endif +}