Skip to content

Commit ffebbcd

Browse files
committed
perftest: fix premature exit when select() is interrupted by SIGALRM
The perftest framework makes extensive use of alarm() to control test duration (--duration) and to schedule periodic tasks. Functions such as `run_iter_bw()`, `run_iter_lat_send()`, and `run_iter_bi()` install a handler via `signal(SIGALRM, catch_alarm)` when the -D option is used, and then set an alarm. In `run_iter_bw_server()` and `run_iter_bi()`, a watchdog is also installed in iterations mode via `signal(SIGALRM, check_alive)` followed by `alarm(60)` to detect stalled tests. In the problematic case, `run_iter_bi()` with the -e option invokes `ctx_notify_send_recv_events()`, which performs a `select()` on two file descriptors: `ctx->recv_channel->fd` (the CQ receive completion channel) and `ctx->send_channel->fd` (the CQ send completion channel). When a completion event is generated, the kernel marks the corresponding file descriptor readable and `select()` returns. However, due to low processing speed on some NICs, it can happen that no completion event is generated within 60 seconds (the test case has not yet finished under a high-pressure test). The watchdog `alarm()` then fires, delivering SIGALRM, which interrupts the blocking `select()` call. The function then exits with an error instead of retrying. This behavior exposes a robustness issue in perftest: SIGALRM in this context is meant only as a check-alive signal, not as a fatal condition. A `select()` call interrupted by SIGALRM should be restarted rather than causing an unexpected termination. This patch updates perftest to properly handle EINTR by retrying `select()` when it is interrupted by SIGALRM, ensuring correct behavior even under slow device processing conditions. Signed-off-by: Ruizhe Zhou <zhouruizhe@resnics.com>
1 parent a9f846e commit ffebbcd

3 files changed

Lines changed: 33 additions & 9 deletions

File tree

src/perftest_communication.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1294,9 +1294,10 @@ int rdma_client_connect(struct pingpong_context *ctx,struct perftest_parameters
12941294
}
12951295

12961296
if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
1297-
fprintf(stderr, "Unexpected CM event bl blka %d\n", event->event);
1297+
fprintf(stderr, "Unexpected CM event bl blka %s; error: %d.\n",
1298+
rdma_event_str(event->event), event->status);
12981299
rdma_ack_cm_event(event);
1299-
return FAILURE;
1300+
return FAILURE;
13001301
}
13011302

13021303
if (user_param->connection_type == UD) {

src/perftest_resources.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ static __always_inline int poll_completions(
110110
struct perftest_parameters* duration_param;
111111
struct check_alive_data check_alive_data;
112112

113+
volatile sig_atomic_t g_sigalarm_fired = 0;
113114

114115
/******************************************************************************
115116
* Beginning
@@ -6621,6 +6622,7 @@ uint16_t ctx_get_local_lid(struct ibv_context *context,int port)
66216622
******************************************************************************/
66226623
void catch_alarm(int sig)
66236624
{
6625+
g_sigalarm_fired = 1;
66246626
switch (duration_param->state) {
66256627
case START_STATE:
66266628
duration_param->state = SAMPLE_STATE;
@@ -6648,6 +6650,7 @@ void catch_alarm(int sig)
66486650

66496651
void check_alive(int sig)
66506652
{
6653+
g_sigalarm_fired = 1;
66516654
if (check_alive_data.current_totrcnt > check_alive_data.last_totrcnt) {
66526655
check_alive_data.last_totrcnt = check_alive_data.current_totrcnt;
66536656
alarm(60);

src/perftest_resources.h

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@
7272
#include <sys/socket.h>
7373
#include <netdb.h>
7474
#include <fcntl.h>
75+
#include <string.h>
76+
#include <errno.h>
77+
#include <signal.h>
7578
#include "perftest_parameters.h"
7679

7780
#ifdef HAVE_CUDA
@@ -201,6 +204,7 @@ static inline uint64_t build_wr_id(uint32_t wr_index, uint16_t qp_index)
201204
return ((uint64_t)wr_index) | ((uint64_t)qp_index << WR_ID_QP_INDEX_OFFSET);
202205
}
203206

207+
extern volatile sig_atomic_t g_sigalarm_fired;
204208
/******************************************************************************
205209
* Perftest resources Structures and data types.
206210
******************************************************************************/
@@ -942,15 +946,31 @@ static __inline void increase_rem_addr(struct ibv_send_wr *wr,int size,uint64_t
942946
static __inline int ctx_notify_send_recv_events(struct pingpong_context *ctx)
943947
{
944948
fd_set rfds;
949+
int ret;
945950

946-
FD_ZERO(&rfds);
947-
FD_SET(ctx->recv_channel->fd, &rfds);
948-
FD_SET(ctx->send_channel->fd, &rfds);
951+
do {
952+
FD_ZERO(&rfds);
953+
FD_SET(ctx->recv_channel->fd, &rfds);
954+
FD_SET(ctx->send_channel->fd, &rfds);
949955

950-
if (select(MAX(ctx->recv_channel->fd,
951-
ctx->send_channel->fd) + 1,
952-
&rfds, NULL, NULL, NULL) == -1) {
953-
fprintf(stderr, "Failed to get completion events\n");
956+
g_sigalarm_fired = 0;
957+
958+
ret = select(MAX(ctx->recv_channel->fd,
959+
ctx->send_channel->fd) + 1,
960+
&rfds, NULL, NULL, NULL);
961+
962+
if (ret == -1 && errno == EINTR) {
963+
if (g_sigalarm_fired) {
964+
fprintf(stderr, "Confirmed: select() was interrupted by SIGALARM. Retrying...\n");
965+
} else {
966+
fprintf(stderr, "Warning: select() interrupted by another signal. Retrying...\n");
967+
}
968+
}
969+
970+
} while (ret == -1 && errno == EINTR);
971+
972+
if (ret == -1) {
973+
fprintf(stderr, "Failed to get completion events: %s\n", strerror(errno));
954974
return FAILURE;
955975
}
956976

0 commit comments

Comments
 (0)