From d7d8153e13e40aede35ae3be815a49b65fbea49e Mon Sep 17 00:00:00 2001 From: tengweicai Date: Thu, 23 Apr 2026 15:20:39 +0800 Subject: [PATCH] fix: clear errno after push_back in listDir to prevent false ENOMEM crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vector::push_back may internally call malloc→mmap which can transiently fail and set errno=ENOMEM even when the allocation ultimately succeeds via a different path. Since POSIX readdir does not clear errno on EOF, the stale ENOMEM survives to the GLOO_ENFORCE(errno == 0) check after the loop, causing a spurious crash. This is observed in practice on machines with many PCI devices (600+) under memory pressure from concurrent process spawning. --- gloo/common/linux.cc | 1 + gloo/test/linux_test.cc | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/gloo/common/linux.cc b/gloo/common/linux.cc index 9b45fc314..88f4855c6 100644 --- a/gloo/common/linux.cc +++ b/gloo/common/linux.cc @@ -74,6 +74,7 @@ static std::vector listDir(const std::string& path) { continue; } result.push_back(dirent->d_name); + errno = 0; } GLOO_ENFORCE(errno == 0, strerror(errno)); auto rv = closedir(dirp); diff --git a/gloo/test/linux_test.cc b/gloo/test/linux_test.cc index 2a1a7cb4c..39d5d1f7d 100644 --- a/gloo/test/linux_test.cc +++ b/gloo/test/linux_test.cc @@ -6,6 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include #include @@ -45,6 +47,15 @@ TEST_F(LinuxTest, PCIDistance) { } } +// Verify that listDir (called by pciDevices) tolerates stale errno left by +// prior allocations. Before the fix, vector::push_back inside the readdir +// loop could leave errno=ENOMEM from a transient mmap failure, causing +// GLOO_ENFORCE(errno == 0) to crash after readdir returns NULL at EOF. 
+TEST_F(LinuxTest, PciDevicesDoesNotCrashWithStaleErrno) { // Regression test: a non-zero errno on entry must not make pciDevices() throw.
+  errno = ENOMEM; // Seed a stale error code before invoking the code under test.
+  ASSERT_NO_THROW(pciDevices(kPCIClassNetwork)); // NOTE(review): this only covers errno being stale *before* the call; it does not force errno to be set inside the readdir loop -- confirm it actually fails without the fix.
+}
+
 } // namespace } // namespace test } // namespace gloo