Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,13 @@ protected int getRequiredRackCount(int numReplicas, int excludedRackCount) {
* @return The max number of replicas per rack
*/
protected int getMaxReplicasPerRack(int numReplicas, int numberOfRacks) {
if (numberOfRacks <= 0) {
// No rack information means there is no per-rack constraint to
// enforce. Callers are expected to short-circuit before reaching
// here, but guard the divide site against transient empty-topology
// windows (HDDS-15350).
return numReplicas;
}
return numReplicas / numberOfRacks
+ Math.min(numReplicas % numberOfRacks, 1);
}
Expand All @@ -436,7 +443,16 @@ public ContainerPlacementStatus validateContainerPlacement(
NetworkTopology topology = nodeManager.getClusterNetworkTopologyMap();
// We have a network topology so calculate if it is satisfied or not.
int requiredRacks = getRequiredRackCount(replicas, 0);
if (topology == null || replicas == 1 || requiredRacks == 1) {
// The leaf nodes are all at max level, so the number of nodes at
// maxLevel - 1 is the rack count. Compute up front so we can
// short-circuit when the topology has no rack information, which
// would otherwise reach getMaxReplicasPerRack with numberOfRacks
// == 0 (HDDS-15350: transient empty-topology window during a DN
// decommission crashed the ReplicationMonitor with "/ by zero").
final int numRacks = topology == null ? 0
: topology.getNumOfNodes(topology.getMaxLevel() - 1);
if (topology == null || replicas == 1 || requiredRacks <= 1
|| numRacks <= 0) {
if (!dns.isEmpty()) {
// placement is always satisfied if there is at least one DN.
return validPlacement;
Expand Down Expand Up @@ -471,10 +487,6 @@ public ContainerPlacementStatus validateContainerPlacement(
Function.identity(),
Collectors.reducing(0, e -> 1, Integer::sum)))
.values());
final int maxLevel = topology.getMaxLevel();
// The leaf nodes are all at max level, so the number of nodes at
// leafLevel - 1 is the rack count
int numRacks = topology.getNumOfNodes(maxLevel - 1);
if (replicas < requiredRacks) {
requiredRacks = replicas;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,46 @@ public void testValidatePlacementWithDeadMaintenanceNode() throws NodeNotFoundEx
assertTrue(placementStatus.isPolicySatisfied());
}

/**
* HDDS-15350: when the network topology transiently reports zero racks
* (observed during a DN decommission), validateContainerPlacement must
* not crash with ArithmeticException ("/ by zero") in
* getMaxReplicasPerRack. Without the fix this test throws and SCM's
* ReplicationMonitor thread dies along with it.
*/
@Test
public void testValidateContainerPlacementWithZeroRackTopology() {
List<DatanodeDetails> nodes = ImmutableList.of(
MockDatanodeDetails.randomDatanodeDetails(),
MockDatanodeDetails.randomDatanodeDetails(),
MockDatanodeDetails.randomDatanodeDetails());
NodeManager mockNodeManager = mock(NodeManager.class);
when(mockNodeManager.getAllNodes()).thenAnswer(inv -> nodes);

// Topology that reports zero racks at the rack level - the empty
// -topology window observed during DN decommission.
NetworkTopology topology = mock(NetworkTopology.class);
when(topology.getMaxLevel()).thenReturn(3);
when(topology.getNumOfNodes(anyInt())).thenReturn(0);
when(mockNodeManager.getClusterNetworkTopologyMap()).thenReturn(topology);

// rackCnt=2 makes DummyPlacementPolicy.getRequiredRackCount return
// min(replicas, 2) > 1, so the original early-return guard does NOT
// fire and execution proceeds to the divide site.
Map<Integer, Integer> rackMap = new HashMap<>();
rackMap.put(0, 0);
rackMap.put(1, 0);
rackMap.put(2, 0);
DummyPlacementPolicy policy = new DummyPlacementPolicy(
mockNodeManager, conf, rackMap, 2);

ContainerPlacementStatus status =
policy.validateContainerPlacement(nodes, 3);
assertTrue(status.isPolicySatisfied(),
"placement should not crash and should be considered satisfied "
+ "when the topology reports no racks");
}

private static class DummyPlacementPolicy extends SCMCommonPlacementPolicy {
private Map<DatanodeDetails, Node> rackMap;
private List<Node> racks;
Expand Down