Fix reporting of EC shard sizes from nodes to masters. (#7835)
SeaweedFS tracks EC shard sizes on topology data stuctures, but this information is never
relayed to master servers :( The end result is that commands reporting disk usage, such
as `volume.list` and `cluster.status`, yield incorrect figures when EC shards are present.
As an example for a simple 5-node test cluster, before...
```
> volume.list
Topology volumeSizeLimit:30000 MB hdd(volume:6/40 active:6 free:33 remote:0)
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9001 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:3 size:88967096 file_count:172 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[1 5]
Disk hdd total size:88967096 file_count:172
DataNode 192.168.10.111:9001 total size:88967096 file_count:172
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9002 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:77267536 file_count:166 replica_placement:2 version:3 modified_at_second:1766349617
volume id:3 size:88967096 file_count:172 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[0 4]
Disk hdd total size:166234632 file_count:338
DataNode 192.168.10.111:9002 total size:166234632 file_count:338
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9003 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:2 size:77267536 file_count:166 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[2 6]
Disk hdd total size:77267536 file_count:166
DataNode 192.168.10.111:9003 total size:77267536 file_count:166
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9004 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:77267536 file_count:166 replica_placement:2 version:3 modified_at_second:1766349617
volume id:3 size:88967096 file_count:172 replica_placement:2 version:3 modified_at_second:1766349617
ec volume id:1 collection: shards:[3 7]
Disk hdd total size:166234632 file_count:338
DataNode 192.168.10.111:9004 total size:166234632 file_count:338
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9005 hdd(volume:0/8 active:0 free:8 remote:0)
Disk hdd(volume:0/8 active:0 free:8 remote:0) id:0
ec volume id:1 collection: shards:[8 9 10 11 12 13]
Disk hdd total size:0 file_count:0
Rack DefaultRack total size:498703896 file_count:1014
DataCenter DefaultDataCenter total size:498703896 file_count:1014
total size:498703896 file_count:1014
```
...and after:
```
> volume.list
Topology volumeSizeLimit:30000 MB hdd(volume:6/40 active:6 free:33 remote:0)
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9001 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:2 size:81761800 file_count:161 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[1 5 9] sizes:[1:8.00 MiB 5:8.00 MiB 9:8.00 MiB] total:24.00 MiB
Disk hdd total size:81761800 file_count:161
DataNode 192.168.10.111:9001 total size:81761800 file_count:161
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9002 hdd(volume:1/8 active:1 free:7 remote:0)
Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
volume id:3 size:88678712 file_count:170 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[11 12 13] sizes:[11:8.00 MiB 12:8.00 MiB 13:8.00 MiB] total:24.00 MiB
Disk hdd total size:88678712 file_count:170
DataNode 192.168.10.111:9002 total size:88678712 file_count:170
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9003 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:81761800 file_count:161 replica_placement:2 version:3 modified_at_second:1766349495
volume id:3 size:88678712 file_count:170 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[0 4 8] sizes:[0:8.00 MiB 4:8.00 MiB 8:8.00 MiB] total:24.00 MiB
Disk hdd total size:170440512 file_count:331
DataNode 192.168.10.111:9003 total size:170440512 file_count:331
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9004 hdd(volume:2/8 active:2 free:6 remote:0)
Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
volume id:2 size:81761800 file_count:161 replica_placement:2 version:3 modified_at_second:1766349495
volume id:3 size:88678712 file_count:170 replica_placement:2 version:3 modified_at_second:1766349495
ec volume id:1 collection: shards:[2 6 10] sizes:[2:8.00 MiB 6:8.00 MiB 10:8.00 MiB] total:24.00 MiB
Disk hdd total size:170440512 file_count:331
DataNode 192.168.10.111:9004 total size:170440512 file_count:331
DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
DataNode 192.168.10.111:9005 hdd(volume:0/8 active:0 free:8 remote:0)
Disk hdd(volume:0/8 active:0 free:8 remote:0) id:0
ec volume id:1 collection: shards:[3 7] sizes:[3:8.00 MiB 7:8.00 MiB] total:16.00 MiB
Disk hdd total size:0 file_count:0
Rack DefaultRack total size:511321536 file_count:993
DataCenter DefaultDataCenter total size:511321536 file_count:993
total size:511321536 file_count:993
```
This commit is contained in:
@@ -149,8 +149,7 @@ func (erb *ecRebuilder) countLocalShards(node *EcNode, collection string, volume
|
||||
for _, diskInfo := range node.info.DiskInfos {
|
||||
for _, ecShardInfo := range diskInfo.EcShardInfos {
|
||||
if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
|
||||
shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
|
||||
return len(shardBits.ShardIds())
|
||||
return erasure_coding.ShardsCountFromVolumeEcShardInformationMessage(ecShardInfo)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -266,7 +265,7 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
|
||||
fmt.Printf("rebuildOneEcVolume %s %d\n", collection, volumeId)
|
||||
|
||||
// collect shard files to rebuilder local disk
|
||||
var generatedShardIds []uint32
|
||||
var generatedShardIds []erasure_coding.ShardId
|
||||
copiedShardIds, _, err := erb.prepareDataToRecover(rebuilder, collection, volumeId, locations)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -306,7 +305,7 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
|
||||
return nil
|
||||
}
|
||||
|
||||
func (erb *ecRebuilder) generateMissingShards(collection string, volumeId needle.VolumeId, sourceLocation pb.ServerAddress) (rebuiltShardIds []uint32, err error) {
|
||||
func (erb *ecRebuilder) generateMissingShards(collection string, volumeId needle.VolumeId, sourceLocation pb.ServerAddress) (rebuiltShardIds []erasure_coding.ShardId, err error) {
|
||||
|
||||
err = operation.WithVolumeServerClient(false, sourceLocation, erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
|
||||
resp, rebuildErr := volumeServerClient.VolumeEcShardsRebuild(context.Background(), &volume_server_pb.VolumeEcShardsRebuildRequest{
|
||||
@@ -314,34 +313,35 @@ func (erb *ecRebuilder) generateMissingShards(collection string, volumeId needle
|
||||
Collection: collection,
|
||||
})
|
||||
if rebuildErr == nil {
|
||||
rebuiltShardIds = resp.RebuiltShardIds
|
||||
rebuiltShardIds = erasure_coding.Uint32ToShardIds(resp.RebuiltShardIds)
|
||||
}
|
||||
return rebuildErr
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection string, volumeId needle.VolumeId, locations EcShardLocations) (copiedShardIds []uint32, localShardIds []uint32, err error) {
|
||||
func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection string, volumeId needle.VolumeId, locations EcShardLocations) (copiedShardIds []erasure_coding.ShardId, localShardIds []erasure_coding.ShardId, err error) {
|
||||
|
||||
needEcxFile := true
|
||||
var localShardBits erasure_coding.ShardBits
|
||||
localShardsInfo := erasure_coding.NewShardsInfo()
|
||||
for _, diskInfo := range rebuilder.info.DiskInfos {
|
||||
for _, ecShardInfo := range diskInfo.EcShardInfos {
|
||||
if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
|
||||
needEcxFile = false
|
||||
localShardBits = erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
|
||||
localShardsInfo = erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for shardId, ecNodes := range locations {
|
||||
for i, ecNodes := range locations {
|
||||
shardId := erasure_coding.ShardId(i)
|
||||
if len(ecNodes) == 0 {
|
||||
erb.write("missing shard %d.%d\n", volumeId, shardId)
|
||||
continue
|
||||
}
|
||||
|
||||
if localShardBits.HasShardId(erasure_coding.ShardId(shardId)) {
|
||||
localShardIds = append(localShardIds, uint32(shardId))
|
||||
if localShardsInfo.Has(shardId) {
|
||||
localShardIds = append(localShardIds, shardId)
|
||||
erb.write("use existing shard %d.%d\n", volumeId, shardId)
|
||||
continue
|
||||
}
|
||||
@@ -368,7 +368,7 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
|
||||
erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr)
|
||||
} else {
|
||||
erb.write("%s copied %d.%d from %s\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id)
|
||||
copiedShardIds = append(copiedShardIds, uint32(shardId))
|
||||
copiedShardIds = append(copiedShardIds, shardId)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -394,7 +394,7 @@ func (ecShardMap EcShardMap) registerEcNode(ecNode *EcNode, collection string) {
|
||||
existing = make([][]*EcNode, erasure_coding.MaxShardCount)
|
||||
ecShardMap[needle.VolumeId(shardInfo.Id)] = existing
|
||||
}
|
||||
for _, shardId := range erasure_coding.ShardBits(shardInfo.EcIndexBits).ShardIds() {
|
||||
for _, shardId := range erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(shardInfo).Ids() {
|
||||
existing[shardId] = append(existing[shardId], ecNode)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user