SeaweedFS tracks EC shard sizes on topology data structures, but this information is never
relayed to master servers :( The end result is that commands reporting disk usage, such
as `volume.list` and `cluster.status`, yield incorrect figures when EC shards are present.
As an example for a simple 5-node test cluster, before...
```
> volume.list
Topology volumeSizeLimit:30000 MB hdd(volume:6/40 active:6 free:33 remote:0)
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9001 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:3 size:88967096 file_count:172 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[1 5]
Disk hdd total size:88967096 file_count:172
DataNode 192.168.10.111:9001 total size:88967096 file_count:172
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9002 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:77267536 file_count:166 replica_placement:2 version:3 modified_at_second:1766349617
volume id:3 size:88967096 file_count:172 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[0 4]
Disk hdd total size:166234632 file_count:338
DataNode 192.168.10.111:9002 total size:166234632 file_count:338
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9003 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:2 size:77267536 file_count:166 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[2 6]
Disk hdd total size:77267536 file_count:166
DataNode 192.168.10.111:9003 total size:77267536 file_count:166
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9004 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:77267536 file_count:166 replica_placement:2 version:3 modified_at_second:1766349617
volume id:3 size:88967096 file_count:172 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[3 7]
Disk hdd total size:166234632 file_count:338
DataNode 192.168.10.111:9004 total size:166234632 file_count:338
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9005 hdd(volume:0/8 active:0 free:8 remote:0)
Disk hdd(volume:0/8 active:0 free:8 remote:0) id:0
ec volume id:1 collection: shards:[8 9 10 11 12 13]
Disk hdd total size:0 file_count:0
Rack DefaultRack total size:498703896 file_count:1014
DataCenter DefaultDataCenter total size:498703896 file_count:1014
total size:498703896 file_count:1014
```
...and after:
```
> volume.list
Topology volumeSizeLimit:30000 MB hdd(volume:6/40 active:6 free:33 remote:0)
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9001 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:2 size:81761800 file_count:161 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[1 5 9] sizes:[1:8.00 MiB 5:8.00 MiB 9:8.00 MiB] total:24.00 MiB
Disk hdd total size:81761800 file_count:161
DataNode 192.168.10.111:9001 total size:81761800 file_count:161
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9002 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:3 size:88678712 file_count:170 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[11 12 13] sizes:[11:8.00 MiB 12:8.00 MiB 13:8.00 MiB] total:24.00 MiB
Disk hdd total size:88678712 file_count:170
DataNode 192.168.10.111:9002 total size:88678712 file_count:170
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9003 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:81761800 file_count:161 replica_placement:2 version:3 modified_at_second:1766349495
volume id:3 size:88678712 file_count:170 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[0 4 8] sizes:[0:8.00 MiB 4:8.00 MiB 8:8.00 MiB] total:24.00 MiB
Disk hdd total size:170440512 file_count:331
DataNode 192.168.10.111:9003 total size:170440512 file_count:331
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9004 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:81761800 file_count:161 replica_placement:2 version:3 modified_at_second:1766349495
volume id:3 size:88678712 file_count:170 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[2 6 10] sizes:[2:8.00 MiB 6:8.00 MiB 10:8.00 MiB] total:24.00 MiB
Disk hdd total size:170440512 file_count:331
DataNode 192.168.10.111:9004 total size:170440512 file_count:331
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9005 hdd(volume:0/8 active:0 free:8 remote:0)
Disk hdd(volume:0/8 active:0 free:8 remote:0) id:0
ec volume id:1 collection: shards:[3 7] sizes:[3:8.00 MiB 7:8.00 MiB] total:16.00 MiB
Disk hdd total size:0 file_count:0
Rack DefaultRack total size:511321536 file_count:993
DataCenter DefaultDataCenter total size:511321536 file_count:993
total size:511321536 file_count:993
```
295 lines · 10 KiB · Go
package shell

import (
	"testing"

	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
// TestECRebalanceWithLimitedSlots tests that EC rebalance handles the scenario
|
||
// where there are limited free slots on volume servers.
|
||
//
|
||
// This is a regression test for the error:
|
||
//
|
||
// "no free ec shard slots. only 0 left"
|
||
//
|
||
// Scenario (from real usage):
|
||
// - 6 volume servers in 6 racks
|
||
// - Each server has max=10 volume slots
|
||
// - 7 volumes were EC encoded (7 × 14 = 98 EC shards)
|
||
// - All 14 shards per volume are on the original server (not yet distributed)
|
||
//
|
||
// Expected behavior:
|
||
// - The rebalance algorithm should distribute shards across servers
|
||
// - Even if perfect distribution isn't possible, it should do best-effort
|
||
// - Currently fails with "no free ec shard slots" because freeSlots calculation
|
||
//
|
||
// doesn't account for shards being moved (freed slots on source, used on target)
|
||
func TestECRebalanceWithLimitedSlots(t *testing.T) {
|
||
// Build a topology matching the problematic scenario:
|
||
// 6 servers, each with 2+ volumes worth of EC shards (all 14 shards per volume on same server)
|
||
topology := buildLimitedSlotsTopology()
|
||
|
||
// Collect EC nodes from the topology
|
||
ecNodes, totalFreeEcSlots := collectEcVolumeServersByDc(topology, "", types.HardDriveType)
|
||
|
||
t.Logf("Topology summary:")
|
||
t.Logf(" Number of EC nodes: %d", len(ecNodes))
|
||
t.Logf(" Total free EC slots: %d", totalFreeEcSlots)
|
||
|
||
// Log per-node details
|
||
for _, node := range ecNodes {
|
||
shardCount := 0
|
||
for _, diskInfo := range node.info.DiskInfos {
|
||
for _, ecShard := range diskInfo.EcShardInfos {
|
||
shardCount += erasure_coding.ShardsCountFromVolumeEcShardInformationMessage(ecShard)
|
||
}
|
||
}
|
||
t.Logf(" Node %s (rack %s): %d shards, %d free slots",
|
||
node.info.Id, node.rack, shardCount, node.freeEcSlot)
|
||
}
|
||
|
||
// Calculate total EC shards
|
||
totalEcShards := 0
|
||
for _, node := range ecNodes {
|
||
for _, diskInfo := range node.info.DiskInfos {
|
||
for _, ecShard := range diskInfo.EcShardInfos {
|
||
totalEcShards += erasure_coding.ShardsCountFromVolumeEcShardInformationMessage(ecShard)
|
||
}
|
||
}
|
||
}
|
||
t.Logf(" Total EC shards: %d", totalEcShards)
|
||
|
||
// Document the issue:
|
||
// With 98 EC shards (7 volumes × 14 shards) on 6 servers with max=10 each,
|
||
// total capacity is 60 slots. But shards already occupy slots on their current servers.
|
||
//
|
||
// The current algorithm calculates free slots as:
|
||
// freeSlots = maxVolumeCount - volumeCount - ecShardCount
|
||
//
|
||
// If all shards are on their original servers:
|
||
// - Server A has 28 shards (2 volumes × 14) → may have negative free slots
|
||
// - This causes totalFreeEcSlots to be 0 or negative
|
||
//
|
||
// The EXPECTED improvement:
|
||
// - Rebalance should recognize that moving a shard FREES a slot on the source
|
||
// - The algorithm should work iteratively, moving shards one at a time
|
||
// - Even if starting with 0 free slots, moving one shard opens a slot
|
||
|
||
if totalFreeEcSlots < 1 {
|
||
// This is the current (buggy) behavior we're documenting
|
||
t.Logf("")
|
||
t.Logf("KNOWN ISSUE: totalFreeEcSlots = %d (< 1)", totalFreeEcSlots)
|
||
t.Logf("")
|
||
t.Logf("This triggers the error: 'no free ec shard slots. only %d left'", totalFreeEcSlots)
|
||
t.Logf("")
|
||
t.Logf("Analysis:")
|
||
t.Logf(" - %d EC shards across %d servers", totalEcShards, len(ecNodes))
|
||
t.Logf(" - Shards are concentrated on original servers (not distributed)")
|
||
t.Logf(" - Current slot calculation doesn't account for slots freed by moving shards")
|
||
t.Logf("")
|
||
t.Logf("Expected fix:")
|
||
t.Logf(" 1. Rebalance should work iteratively, moving one shard at a time")
|
||
t.Logf(" 2. Moving a shard from A to B: frees 1 slot on A, uses 1 slot on B")
|
||
t.Logf(" 3. The 'free slots' check should be per-move, not global")
|
||
t.Logf(" 4. Or: calculate 'redistributable slots' = total capacity - shards that must stay")
|
||
|
||
// For now, document this is a known issue - don't fail the test
|
||
// When the fix is implemented, this test should be updated to verify the fix works
|
||
return
|
||
}
|
||
|
||
// If we get here, the issue might have been fixed
|
||
t.Logf("totalFreeEcSlots = %d, rebalance should be possible", totalFreeEcSlots)
|
||
}
|
||
|
||
// TestECRebalanceZeroFreeSlots tests the specific scenario where
|
||
// the topology appears to have free slots but rebalance fails.
|
||
//
|
||
// This can happen when the VolumeCount in the topology includes the original
|
||
// volumes that were EC-encoded, making the free slot calculation incorrect.
|
||
func TestECRebalanceZeroFreeSlots(t *testing.T) {
|
||
// Build a topology where volumes were NOT deleted after EC encoding
|
||
// (VolumeCount still reflects the original volumes)
|
||
topology := buildZeroFreeSlotTopology()
|
||
|
||
ecNodes, totalFreeEcSlots := collectEcVolumeServersByDc(topology, "", types.HardDriveType)
|
||
|
||
t.Logf("Zero free slots scenario:")
|
||
for _, node := range ecNodes {
|
||
shardCount := 0
|
||
for _, diskInfo := range node.info.DiskInfos {
|
||
for _, ecShard := range diskInfo.EcShardInfos {
|
||
shardCount += erasure_coding.ShardsCountFromVolumeEcShardInformationMessage(ecShard)
|
||
}
|
||
}
|
||
t.Logf(" Node %s: %d shards, %d free slots, volumeCount=%d, max=%d",
|
||
node.info.Id, shardCount, node.freeEcSlot,
|
||
node.info.DiskInfos[string(types.HardDriveType)].VolumeCount,
|
||
node.info.DiskInfos[string(types.HardDriveType)].MaxVolumeCount)
|
||
}
|
||
t.Logf(" Total free slots: %d", totalFreeEcSlots)
|
||
|
||
if totalFreeEcSlots == 0 {
|
||
t.Logf("")
|
||
t.Logf("SCENARIO REPRODUCED: totalFreeEcSlots = 0")
|
||
t.Logf("This would trigger: 'no free ec shard slots. only 0 left'")
|
||
}
|
||
}
|
||
|
||
// buildZeroFreeSlotTopology creates a topology where rebalance will fail
|
||
// because servers are at capacity (volumeCount equals maxVolumeCount)
|
||
func buildZeroFreeSlotTopology() *master_pb.TopologyInfo {
|
||
diskTypeKey := string(types.HardDriveType)
|
||
|
||
// Each server has max=10, volumeCount=10 (full capacity)
|
||
// Free capacity = (10-10) * 10 = 0 per server
|
||
// This will trigger "no free ec shard slots" error
|
||
return &master_pb.TopologyInfo{
|
||
Id: "test_zero_free_slots",
|
||
DataCenterInfos: []*master_pb.DataCenterInfo{
|
||
{
|
||
Id: "dc1",
|
||
RackInfos: []*master_pb.RackInfo{
|
||
{
|
||
Id: "rack0",
|
||
DataNodeInfos: []*master_pb.DataNodeInfo{
|
||
{
|
||
Id: "127.0.0.1:8080",
|
||
DiskInfos: map[string]*master_pb.DiskInfo{
|
||
diskTypeKey: {
|
||
Type: diskTypeKey,
|
||
MaxVolumeCount: 10,
|
||
VolumeCount: 10, // At full capacity
|
||
EcShardInfos: buildEcShards([]uint32{3, 4}),
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
{
|
||
Id: "rack1",
|
||
DataNodeInfos: []*master_pb.DataNodeInfo{
|
||
{
|
||
Id: "127.0.0.1:8081",
|
||
DiskInfos: map[string]*master_pb.DiskInfo{
|
||
diskTypeKey: {
|
||
Type: diskTypeKey,
|
||
MaxVolumeCount: 10,
|
||
VolumeCount: 10,
|
||
EcShardInfos: buildEcShards([]uint32{1, 7}),
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
{
|
||
Id: "rack2",
|
||
DataNodeInfos: []*master_pb.DataNodeInfo{
|
||
{
|
||
Id: "127.0.0.1:8082",
|
||
DiskInfos: map[string]*master_pb.DiskInfo{
|
||
diskTypeKey: {
|
||
Type: diskTypeKey,
|
||
MaxVolumeCount: 10,
|
||
VolumeCount: 10,
|
||
EcShardInfos: buildEcShards([]uint32{2}),
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
{
|
||
Id: "rack3",
|
||
DataNodeInfos: []*master_pb.DataNodeInfo{
|
||
{
|
||
Id: "127.0.0.1:8083",
|
||
DiskInfos: map[string]*master_pb.DiskInfo{
|
||
diskTypeKey: {
|
||
Type: diskTypeKey,
|
||
MaxVolumeCount: 10,
|
||
VolumeCount: 10,
|
||
EcShardInfos: buildEcShards([]uint32{5, 6}),
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
},
|
||
}
|
||
}
|
||
|
||
func buildEcShards(volumeIds []uint32) []*master_pb.VolumeEcShardInformationMessage {
|
||
var shards []*master_pb.VolumeEcShardInformationMessage
|
||
for _, vid := range volumeIds {
|
||
si := erasure_coding.NewShardsInfo()
|
||
for _, id := range erasure_coding.AllShardIds() {
|
||
si.Set(id, 1234)
|
||
}
|
||
shards = append(shards, &master_pb.VolumeEcShardInformationMessage{
|
||
Id: vid,
|
||
Collection: "ectest",
|
||
EcIndexBits: si.Bitmap(),
|
||
ShardSizes: si.SizesInt64(),
|
||
})
|
||
}
|
||
return shards
|
||
}
|
||
|
||
// buildLimitedSlotsTopology creates a topology matching the problematic scenario:
|
||
// - 6 servers in 6 racks
|
||
// - Each server has max=10 volume slots
|
||
// - 7 volumes were EC encoded, shards distributed as follows:
|
||
// - rack0 (8080): volumes 3,4 → 28 shards
|
||
// - rack1 (8081): volumes 1,7 → 28 shards
|
||
// - rack2 (8082): volume 2 → 14 shards
|
||
// - rack3 (8083): volumes 5,6 → 28 shards
|
||
// - rack4 (8084): (no volumes originally)
|
||
// - rack5 (8085): (no volumes originally)
|
||
func buildLimitedSlotsTopology() *master_pb.TopologyInfo {
|
||
return &master_pb.TopologyInfo{
|
||
Id: "test_limited_slots",
|
||
DataCenterInfos: []*master_pb.DataCenterInfo{
|
||
{
|
||
Id: "dc1",
|
||
RackInfos: []*master_pb.RackInfo{
|
||
buildRackWithEcShards("rack0", "127.0.0.1:8080", 10, []uint32{3, 4}),
|
||
buildRackWithEcShards("rack1", "127.0.0.1:8081", 10, []uint32{1, 7}),
|
||
buildRackWithEcShards("rack2", "127.0.0.1:8082", 10, []uint32{2}),
|
||
buildRackWithEcShards("rack3", "127.0.0.1:8083", 10, []uint32{5, 6}),
|
||
buildRackWithEcShards("rack4", "127.0.0.1:8084", 10, []uint32{}),
|
||
buildRackWithEcShards("rack5", "127.0.0.1:8085", 10, []uint32{}),
|
||
},
|
||
},
|
||
},
|
||
}
|
||
}
|
||
|
||
// buildRackWithEcShards creates a rack with one data node containing EC shards
|
||
// for the specified volume IDs (all 14 shards per volume)
|
||
func buildRackWithEcShards(rackId, nodeId string, maxVolumes int64, volumeIds []uint32) *master_pb.RackInfo {
|
||
// Note: types.HardDriveType is "" (empty string), so we use "" as the key
|
||
diskTypeKey := string(types.HardDriveType)
|
||
|
||
return &master_pb.RackInfo{
|
||
Id: rackId,
|
||
DataNodeInfos: []*master_pb.DataNodeInfo{
|
||
{
|
||
Id: nodeId,
|
||
DiskInfos: map[string]*master_pb.DiskInfo{
|
||
diskTypeKey: {
|
||
Type: diskTypeKey,
|
||
MaxVolumeCount: maxVolumes,
|
||
VolumeCount: int64(len(volumeIds)), // Original volumes still counted
|
||
EcShardInfos: buildEcShards(volumeIds),
|
||
},
|
||
},
|
||
},
|
||
},
|
||
}
|
||
}
|