From e8cd7cdda2292c0953240fce2aada4178e2f6d0f Mon Sep 17 00:00:00 2001 From: Vitus Benson Date: Wed, 1 Apr 2026 20:33:22 +0200 Subject: [PATCH] Fix NODE_ID bug in SLURM process group initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _init_process_group_slurm() used _WorkerInfo.NODE_ID to index into tasks_per_node, but _WorkerInfo.NODE_ID is None at that point — it only gets set later by _initialize_via_tcp(). Use the local node_id variable (parsed from SLURM_NODEID) instead. --- dmlcloud/core/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlcloud/core/distributed.py b/dmlcloud/core/distributed.py index 2708a9e..925f59d 100644 --- a/dmlcloud/core/distributed.py +++ b/dmlcloud/core/distributed.py @@ -277,7 +277,7 @@ def _init_process_group_slurm(port=DEFAULT_PORT, **kwargs): tasks_per_node.extend([int(ntasks)] * int(nnodes[:-1])) else: tasks_per_node.append(int(t)) - local_world_size = tasks_per_node[_WorkerInfo.NODE_ID] + local_world_size = tasks_per_node[node_id] _initialize_via_tcp( ip=ip,