fix ec.balance failing to rebalance when all nodes share all volumes (#8796)

* fix ec.balance failing to rebalance when all nodes share all volumes (#8793)

Two bugs in doBalanceEcRack prevented rebalancing:

1. Sorting by freeEcSlot instead of actual shard count caused incorrect
   empty/full node selection when nodes have different total capacities.

2. The volume-level check skipped any volume already present on the
   target node. When every node has a shard of every volume (common
   with many EC volumes across N nodes with N shards each), no moves
   were possible.

Fix: sort by actual shard count, and use a two-pass approach - first
prefer moving shards of volumes not on the target (best diversity),
then fall back to moving specific shard IDs not yet on the target.

* add test simulating real cluster topology from issue #8793

Uses the actual node addresses and mixed max capacities (80 vs 33)
from the reporter's 14-node cluster to verify ec.balance correctly
rebalances with heterogeneous node sizes.

* fix pass comments to match 0-indexed loop variable
This commit is contained in:
Chris Lu
2026-03-27 11:14:10 -07:00
committed by GitHub
parent c2c58419b8
commit f256002d0b
2 changed files with 181 additions and 28 deletions

View File

@@ -1301,49 +1301,65 @@ func (ecb *ecBalancer) doBalanceEcRack(ecRack *EcRack) error {
hasMove := true
for hasMove {
hasMove = false
// Sort by shard count (ascending) so emptyNode has fewest shards, fullNode has most.
// Using freeEcSlot would be incorrect when nodes have different total capacities.
slices.SortFunc(rackEcNodes, func(a, b *EcNode) int {
return b.freeEcSlot - a.freeEcSlot
return ecNodeIdToShardCount[a.info.Id] - ecNodeIdToShardCount[b.info.Id]
})
emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1]
emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id]
if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount {
emptyNodeIds := make(map[uint32]bool)
// Build a map of volume ID -> shard bits on the empty node
emptyNodeVolumeShards := make(map[uint32]erasure_coding.ShardBits)
if emptyDiskInfo, found := emptyNode.info.DiskInfos[string(ecb.diskType)]; found {
for _, shards := range emptyDiskInfo.EcShardInfos {
emptyNodeIds[shards.Id] = true
emptyNodeVolumeShards[shards.Id] = erasure_coding.ShardBits(shards.EcIndexBits)
}
}
if fullDiskInfo, found := fullNode.info.DiskInfos[string(ecb.diskType)]; found {
for _, shards := range fullDiskInfo.EcShardInfos {
if _, found := emptyNodeIds[shards.Id]; found {
continue
}
si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(shards)
for _, shardId := range si.Ids() {
vid := needle.VolumeId(shards.Id)
// For balancing, strictly require matching disk type
// For balancing, strictly require matching disk type and apply anti-affinity
dataShardCount := ecb.getDataShardCount()
destDiskId := pickBestDiskOnNode(emptyNode, vid, ecb.diskType, true, shardId, dataShardCount)
if destDiskId > 0 {
fmt.Printf("%s moves ec shards %d.%d to %s (disk %d)\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id, destDiskId)
} else {
fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
// Pass 0: prefer moving shards of volumes not on the empty node (best diversity)
// Pass 1: move shards of shared volumes where the specific shard ID differs
for pass := 0; pass < 2 && !hasMove; pass++ {
for _, shards := range fullDiskInfo.EcShardInfos {
emptyBits, volumeOnEmpty := emptyNodeVolumeShards[shards.Id]
if pass == 0 && volumeOnEmpty {
continue // pass 0: skip volumes already on the empty node
}
err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, shards.Collection, vid, shardId, emptyNode, destDiskId, ecb.applyBalancing, ecb.diskType)
if err != nil {
return err
if pass == 1 && !volumeOnEmpty {
continue // pass 1: only consider volumes already on the empty node
}
si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(shards)
for _, shardId := range si.Ids() {
if pass == 1 && emptyBits.Has(shardId) {
continue // Skip shard IDs already on the empty node
}
vid := needle.VolumeId(shards.Id)
// For balancing, strictly require matching disk type and apply anti-affinity
dataShardCount := ecb.getDataShardCount()
destDiskId := pickBestDiskOnNode(emptyNode, vid, ecb.diskType, true, shardId, dataShardCount)
ecNodeIdToShardCount[emptyNode.info.Id]++
ecNodeIdToShardCount[fullNode.info.Id]--
hasMove = true
break
if destDiskId > 0 {
fmt.Printf("%s moves ec shards %d.%d to %s (disk %d)\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id, destDiskId)
} else {
fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id)
}
err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, shards.Collection, vid, shardId, emptyNode, destDiskId, ecb.applyBalancing, ecb.diskType)
if err != nil {
return err
}
ecNodeIdToShardCount[emptyNode.info.Id]++
ecNodeIdToShardCount[fullNode.info.Id]--
hasMove = true
break
}
if hasMove {
break
}
}
break
}
}
}