From 8d2bc7fa60d2781903b8953877e22c747739ca2a Mon Sep 17 00:00:00 2001 From: Hersh Godse Date: Fri, 29 May 2026 01:54:39 +0000 Subject: [PATCH] megatron_worker: split LoRA broadcast barrier to fix multi-engine hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In _save_lora_adapters_and_sync, the torch.distributed.barrier() was AFTER rank 0's HTTP load_lora_adapter fan-out, which on multi-engine configs (num_engines=4) sends to all 4 backend URLs in parallel — each loading a 35B-LoRA from Weka takes seconds. Cumulative HTTP time can exceed gloo's short "Application timeout" (~10s by default), which closes the gloo pair on ranks 1-3 with "Application timeout caused pair closure". Fix: barrier AFTER rank-0 disk write (fast) but BEFORE the HTTP fan-out. Ranks 1-3 unblock immediately and can do other work (or hit the outer barrier in broadcast_to_inference_engines, which times out under heavy load but at least doesn't double-block the inner step). Reproed on Qwen3.6-35B-A3B + tau-retail with our pinned base 36f38c7 + Sumanth's #1720 cherry-pick. Single trainer + 4 vLLM engines: multi-engine routing works (per-engine /v1/completions balanced) but broadcast at first training step hangs in this barrier. Will follow up with: (a) instrument fan-out duration so we know if the outer barrier is also at risk, (b) make HTTP fan-out concurrent across engines (it should already be via _load_on_server asyncio.gather, verify). --- .../workers/megatron/megatron_worker.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py b/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py index 58061e9c3c..d237bd64d8 100644 --- a/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py +++ b/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py @@ -999,7 +999,17 @@ async def _save_lora_adapters_and_sync( with open(os.path.join(lora_sync_path, "adapter_config.json"), "w", encoding="utf-8") as f: json.dump(adapter_config, f, ensure_ascii=False, indent=4) - # Send LoRA disk loading request to inference engine. + # Sync after rank-0 disk write so all ranks see consistent state on + # Weka. Critically, this barrier is BEFORE the rank-0 HTTP fan-out to + # inference engines — fan-out to 4 engines on multi-engine configs can + # take >gloo-timeout (~10s) which would hang ranks 1-3 at the barrier. + torch.distributed.barrier() + + # Rank-0 HTTP fan-out happens AFTER the barrier, so ranks 1-3 don't + # block on its duration. The caller's outer barrier + # (broadcast_to_inference_engines, line ~1029) still gates the next + # training phase on the fan-out completing. + if torch.distributed.get_rank() == 0: from skyrl.backends.skyrl_train.inference_servers.remote_inference_client import ( RemoteInferenceClient, ) @@ -1010,8 +1020,6 @@ async def _save_lora_adapters_and_sync( lora_request = LoraLoadRequest(lora_path=lora_sync_path, lora_name=lora_name) await inference_engine_client.update_named_weights(lora_request) - torch.distributed.barrier() - async def broadcast_to_inference_engines( self, inference_engine_client: "InferenceEngineInterface",