
Commit 0d16808

Instrument background processor loop timing
Scaling to 16 tokio workers and a larger instance had zero impact on the heartbeat polling delay metric, ruling out worker starvation as the cause of the remaining 1-2s jitter (and the 4s spikes every 60s).

The jitter must therefore come from synchronous operations inside the process_events_async loop that block a worker thread without yielding. The main suspects are peer_manager.process_events(), which blocks on ChannelManager locks every iteration, and channel_manager.timer_tick_occurred(), which fires every 60s and correlates with the 4s spikes.

Wraps every synchronous call in the loop with Instant timing and logs a full breakdown whenever a loop iteration exceeds 500ms.
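For reference, the pattern applied throughout the diff below is plain std::time::Instant bracketing around each synchronous call, with a single warn-level breakdown emitted only when the iteration blows its budget. A minimal, self-contained sketch of that pattern (the function names here are illustrative stand-ins, not the actual lib.rs identifiers):

use std::time::{Duration, Instant};

// Illustrative stand-ins for the synchronous calls being instrumented,
// e.g. peer_manager.process_events() and
// channel_manager.timer_tick_occurred() in the real loop.
fn process_events() { /* synchronous work that may block */ }
fn timer_tick() { /* periodic synchronous work that may block */ }

fn main() {
    loop {
        let loop_start = Instant::now();

        let t = Instant::now();
        process_events();
        let events_ms = t.elapsed().as_millis();

        let t = Instant::now();
        timer_tick();
        let tick_ms = t.elapsed().as_millis();

        // Log a breakdown only when the whole iteration exceeds the
        // 500ms budget, keeping the common fast path quiet.
        let loop_ms = loop_start.elapsed().as_millis();
        if loop_ms > 500 {
            eprintln!(
                "loop took {}ms: events={}ms tick={}ms",
                loop_ms, events_ms, tick_ms
            );
        }

        // Stand-in for the loop's normal sleep between iterations.
        std::thread::sleep(Duration::from_millis(100));
    }
}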
1 parent: b89fc71

1 file changed: lightning-background-processor/src/lib.rs (48 additions, 0 deletions)
@@ -987,11 +987,21 @@ where
     let mut last_forwards_processing_call = sleeper(batch_delay.get());

     loop {
+        let loop_start = Instant::now();
+
+        let t = Instant::now();
         channel_manager.get_cm().process_pending_events_async(async_event_handler).await;
+        let cm_events_ms = t.elapsed().as_millis();
+
+        let t = Instant::now();
         chain_monitor.process_pending_events_async(async_event_handler).await;
+        let chain_mon_events_ms = t.elapsed().as_millis();
+
+        let t = Instant::now();
         if let Some(om) = &onion_messenger {
             om.get_om().process_pending_events_async(async_event_handler).await
         }
+        let om_events_ms = t.elapsed().as_millis();

         // Note that the PeerManager::process_events may block on ChannelManager's locks,
         // hence it comes last here. When the ChannelManager finishes whatever it's doing,
@@ -1004,7 +1014,11 @@ where
         // ChannelManager, we want to minimize methods blocking on a ChannelManager
         // generally, and as a fallback place such blocking only immediately before
         // persistence.
+        let t = Instant::now();
         peer_manager.as_ref().process_events();
+        let peer_mgr_events_ms = t.elapsed().as_millis();
+
+        let t = Instant::now();
         match check_and_reset_sleeper(&mut last_forwards_processing_call, || {
             sleeper(batch_delay.next())
         }) {
@@ -1014,6 +1028,7 @@ where
             Some(true) => break,
             None => {},
         }
+        let htlc_fwd_ms = t.elapsed().as_millis();

         // We wait up to 100ms, but track how long it takes to detect being put to sleep,
         // see `await_start`'s use below.
@@ -1067,14 +1082,18 @@ where
         } else {
             false
         };
+        let t = Instant::now();
+        let mut cm_tick_fired = false;
         match check_and_reset_sleeper(&mut last_freshness_call, || sleeper(FRESHNESS_TIMER)) {
             Some(false) => {
                 log_trace!(logger, "Calling ChannelManager's timer_tick_occurred");
                 channel_manager.get_cm().timer_tick_occurred();
+                cm_tick_fired = true;
             },
             Some(true) => break,
             None => {},
         }
+        let cm_tick_ms = t.elapsed().as_millis();

         let mut futures = Joiner::new();

@@ -1132,9 +1151,12 @@ where
             GossipSync::Rapid(_) => !have_pruned || prune_timer_elapsed,
             _ => prune_timer_elapsed,
         };
+        let t = Instant::now();
+        let mut prune_fired = false;
         if should_prune {
             // The network graph must not be pruned while rapid sync completion is pending
             if let Some(network_graph) = gossip_sync.prunable_network_graph() {
+                prune_fired = true;
                 if let Some(duration_since_epoch) = fetch_time() {
                     log_trace!(logger, "Pruning and persisting network graph.");
                     network_graph.remove_stale_channels_and_tracking_with_time(
@@ -1166,6 +1188,8 @@ where
                 have_pruned = true;
             }
         }
+        let prune_ms = t.elapsed().as_millis();
+
         if !have_decayed_scorer {
             if let Some(ref scorer) = scorer {
                 if let Some(duration_since_epoch) = fetch_time() {
@@ -1247,6 +1271,7 @@ where
             res?;
         }

+        let t = Instant::now();
         match check_and_reset_sleeper(&mut last_onion_message_handler_call, || {
             sleeper(ONION_MESSAGE_HANDLER_TIMER)
         }) {
@@ -1259,8 +1284,10 @@ where
             Some(true) => break,
             None => {},
         }
+        let om_tick_ms = t.elapsed().as_millis();

         // Peer manager timer tick. If we were interrupted on a mobile platform, we disconnect all peers.
+        let t = Instant::now();
         if await_slow {
             // On various platforms, we may be starved of CPU cycles for several reasons.
             // E.g. on iOS, if we've been in the background, we will be entirely paused.
@@ -1288,7 +1315,10 @@ where
             }
         }

+        let peer_mgr_tick_ms = t.elapsed().as_millis();
+
         // Rebroadcast pending claims.
+        let t = Instant::now();
         match check_and_reset_sleeper(&mut last_rebroadcast_call, || sleeper(REBROADCAST_TIMER)) {
             Some(false) => {
                 log_trace!(logger, "Rebroadcasting monitor's pending claims");
@@ -1297,6 +1327,24 @@ where
             Some(true) => break,
             None => {},
         }
+        let rebroadcast_ms = t.elapsed().as_millis();
+
+        let loop_ms = loop_start.elapsed().as_millis();
+        if loop_ms > 500 {
+            log_warn!(logger,
+                "[bg-processor] loop took {}ms: \
+                cm_events={}ms chain_mon={}ms om_events={}ms \
+                peer_mgr={}ms htlc_fwd={}ms \
+                cm_tick={}ms{} prune={}ms{} \
+                om_tick={}ms peer_tick={}ms rebroadcast={}ms",
+                loop_ms,
+                cm_events_ms, chain_mon_events_ms, om_events_ms,
+                peer_mgr_events_ms, htlc_fwd_ms,
+                cm_tick_ms, if cm_tick_fired { "(fired)" } else { "" },
+                prune_ms, if prune_fired { "(fired)" } else { "" },
+                om_tick_ms, peer_mgr_tick_ms, rebroadcast_ms,
+            );
+        }
     }
     log_trace!(logger, "Terminating background processor.");
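For illustration, a 60s-tick spike would surface in the new log as a line shaped like the following (the values are hypothetical, not captured output):

    [bg-processor] loop took 4123ms: cm_events=2ms chain_mon=1ms om_events=0ms peer_mgr=38ms htlc_fwd=1ms cm_tick=4062ms(fired) prune=0ms om_tick=0ms peer_tick=1ms rebroadcast=0ms

A breakdown like this would point directly at timer_tick_occurred() as the source of the periodic 4s spikes.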
