Files
seaweedFS/weed/storage/erasure_coding/ec_volume.go
Lisandro Pin 6b98b52acc Fix reporting of EC shard sizes from nodes to masters. (#7835)
SeaweedFS tracks EC shard sizes on topology data stuctures, but this information is never
relayed to master servers :( The end result is that commands reporting disk usage, such
as `volume.list` and `cluster.status`, yield incorrect figures when EC shards are present.

As an example for a simple 5-node test cluster, before...

```
> volume.list
Topology volumeSizeLimit:30000 MB hdd(volume:6/40 active:6 free:33 remote:0)
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9001 hdd(volume:1/8 active:1 free:7 remote:0)
        Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
          volume id:3  size:88967096  file_count:172  replica_placement:2  version:3  modified_at_second:1766349617
          ec volume id:1 collection: shards:[1 5]
        Disk hdd total size:88967096 file_count:172
      DataNode 192.168.10.111:9001 total size:88967096 file_count:172
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9002 hdd(volume:2/8 active:2 free:6 remote:0)
        Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
          volume id:2  size:77267536  file_count:166  replica_placement:2  version:3  modified_at_second:1766349617
          volume id:3  size:88967096  file_count:172  replica_placement:2  version:3  modified_at_second:1766349617
          ec volume id:1 collection: shards:[0 4]
        Disk hdd total size:166234632 file_count:338
      DataNode 192.168.10.111:9002 total size:166234632 file_count:338
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9003 hdd(volume:1/8 active:1 free:7 remote:0)
        Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
          volume id:2  size:77267536  file_count:166  replica_placement:2  version:3  modified_at_second:1766349617
          ec volume id:1 collection: shards:[2 6]
        Disk hdd total size:77267536 file_count:166
      DataNode 192.168.10.111:9003 total size:77267536 file_count:166
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9004 hdd(volume:2/8 active:2 free:6 remote:0)
        Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
          volume id:2  size:77267536  file_count:166  replica_placement:2  version:3  modified_at_second:1766349617
          volume id:3  size:88967096  file_count:172  replica_placement:2  version:3  modified_at_second:1766349617
          ec volume id:1 collection: shards:[3 7]
        Disk hdd total size:166234632 file_count:338
      DataNode 192.168.10.111:9004 total size:166234632 file_count:338
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9005 hdd(volume:0/8 active:0 free:8 remote:0)
        Disk hdd(volume:0/8 active:0 free:8 remote:0) id:0
          ec volume id:1 collection: shards:[8 9 10 11 12 13]
        Disk hdd total size:0 file_count:0
    Rack DefaultRack total size:498703896 file_count:1014
  DataCenter DefaultDataCenter total size:498703896 file_count:1014
total size:498703896 file_count:1014
```

...and after:

```
> volume.list
Topology volumeSizeLimit:30000 MB hdd(volume:6/40 active:6 free:33 remote:0)
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9001 hdd(volume:1/8 active:1 free:7 remote:0)
        Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
          volume id:2  size:81761800  file_count:161  replica_placement:2  version:3  modified_at_second:1766349495
          ec volume id:1 collection: shards:[1 5 9] sizes:[1:8.00 MiB 5:8.00 MiB 9:8.00 MiB] total:24.00 MiB
        Disk hdd total size:81761800 file_count:161
      DataNode 192.168.10.111:9001 total size:81761800 file_count:161
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9002 hdd(volume:1/8 active:1 free:7 remote:0)
        Disk hdd(volume:1/8 active:1 free:7 remote:0) id:0
          volume id:3  size:88678712  file_count:170  replica_placement:2  version:3  modified_at_second:1766349495
          ec volume id:1 collection: shards:[11 12 13] sizes:[11:8.00 MiB 12:8.00 MiB 13:8.00 MiB] total:24.00 MiB
        Disk hdd total size:88678712 file_count:170
      DataNode 192.168.10.111:9002 total size:88678712 file_count:170
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9003 hdd(volume:2/8 active:2 free:6 remote:0)
        Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
          volume id:2  size:81761800  file_count:161  replica_placement:2  version:3  modified_at_second:1766349495
          volume id:3  size:88678712  file_count:170  replica_placement:2  version:3  modified_at_second:1766349495
          ec volume id:1 collection: shards:[0 4 8] sizes:[0:8.00 MiB 4:8.00 MiB 8:8.00 MiB] total:24.00 MiB
        Disk hdd total size:170440512 file_count:331
      DataNode 192.168.10.111:9003 total size:170440512 file_count:331
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9004 hdd(volume:2/8 active:2 free:6 remote:0)
        Disk hdd(volume:2/8 active:2 free:6 remote:0) id:0
          volume id:2  size:81761800  file_count:161  replica_placement:2  version:3  modified_at_second:1766349495
          volume id:3  size:88678712  file_count:170  replica_placement:2  version:3  modified_at_second:1766349495
          ec volume id:1 collection: shards:[2 6 10] sizes:[2:8.00 MiB 6:8.00 MiB 10:8.00 MiB] total:24.00 MiB
        Disk hdd total size:170440512 file_count:331
      DataNode 192.168.10.111:9004 total size:170440512 file_count:331
  DataCenter DefaultDataCenter hdd(volume:6/40 active:6 free:33 remote:0)
    Rack DefaultRack hdd(volume:6/40 active:6 free:33 remote:0)
      DataNode 192.168.10.111:9005 hdd(volume:0/8 active:0 free:8 remote:0)
        Disk hdd(volume:0/8 active:0 free:8 remote:0) id:0
          ec volume id:1 collection: shards:[3 7] sizes:[3:8.00 MiB 7:8.00 MiB] total:16.00 MiB
        Disk hdd total size:0 file_count:0
    Rack DefaultRack total size:511321536 file_count:993
  DataCenter DefaultDataCenter total size:511321536 file_count:993
total size:511321536 file_count:993
```
2025-12-28 19:30:42 -08:00

335 lines
10 KiB
Go

package erasure_coding
import (
"errors"
"fmt"
"os"
"slices"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/idx"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
)
var (
NotFoundError = errors.New("needle not found")
destroyDelaySeconds int64 = 0
)
type EcVolume struct {
VolumeId needle.VolumeId
Collection string
dir string
dirIdx string
ecxFile *os.File
ecxFileSize int64
ecxCreatedAt time.Time
Shards []*EcVolumeShard
ShardLocations map[ShardId][]pb.ServerAddress
ShardLocationsRefreshTime time.Time
ShardLocationsLock sync.RWMutex
Version needle.Version
ecjFile *os.File
ecjFileAccessLock sync.Mutex
diskType types.DiskType
datFileSize int64
ExpireAtSec uint64 //ec volume destroy time, calculated from the ec volume was created
ECContext *ECContext // EC encoding parameters
}
func NewEcVolume(diskType types.DiskType, dir string, dirIdx string, collection string, vid needle.VolumeId) (ev *EcVolume, err error) {
ev = &EcVolume{dir: dir, dirIdx: dirIdx, Collection: collection, VolumeId: vid, diskType: diskType}
dataBaseFileName := EcShardFileName(collection, dir, int(vid))
indexBaseFileName := EcShardFileName(collection, dirIdx, int(vid))
// open ecx file
if ev.ecxFile, err = os.OpenFile(indexBaseFileName+".ecx", os.O_RDWR, 0644); err != nil {
return nil, fmt.Errorf("cannot open ec volume index %s.ecx: %v", indexBaseFileName, err)
}
ecxFi, statErr := ev.ecxFile.Stat()
if statErr != nil {
_ = ev.ecxFile.Close()
return nil, fmt.Errorf("can not stat ec volume index %s.ecx: %v", indexBaseFileName, statErr)
}
ev.ecxFileSize = ecxFi.Size()
ev.ecxCreatedAt = ecxFi.ModTime()
// open ecj file
if ev.ecjFile, err = os.OpenFile(indexBaseFileName+".ecj", os.O_RDWR|os.O_CREATE, 0644); err != nil {
return nil, fmt.Errorf("cannot open ec volume journal %s.ecj: %v", indexBaseFileName, err)
}
// read volume info
ev.Version = needle.Version3
if volumeInfo, _, found, _ := volume_info.MaybeLoadVolumeInfo(dataBaseFileName + ".vif"); found {
ev.Version = needle.Version(volumeInfo.Version)
ev.datFileSize = volumeInfo.DatFileSize
ev.ExpireAtSec = volumeInfo.ExpireAtSec
// Initialize EC context from .vif if present; fallback to defaults
if volumeInfo.EcShardConfig != nil {
ds := int(volumeInfo.EcShardConfig.DataShards)
ps := int(volumeInfo.EcShardConfig.ParityShards)
// Validate shard counts to prevent zero or invalid values
if ds <= 0 || ps <= 0 || ds+ps > MaxShardCount {
glog.Warningf("Invalid EC config in VolumeInfo for volume %d (data=%d, parity=%d), using defaults", vid, ds, ps)
ev.ECContext = NewDefaultECContext(collection, vid)
} else {
ev.ECContext = &ECContext{
Collection: collection,
VolumeId: vid,
DataShards: ds,
ParityShards: ps,
}
glog.V(1).Infof("Loaded EC config from VolumeInfo for volume %d: %s", vid, ev.ECContext.String())
}
} else {
ev.ECContext = NewDefaultECContext(collection, vid)
}
} else {
glog.Warningf("vif file not found,volumeId:%d, filename:%s", vid, dataBaseFileName)
volume_info.SaveVolumeInfo(dataBaseFileName+".vif", &volume_server_pb.VolumeInfo{Version: uint32(ev.Version)})
ev.ECContext = NewDefaultECContext(collection, vid)
}
ev.ShardLocations = make(map[ShardId][]pb.ServerAddress)
return
}
func (ev *EcVolume) AddEcVolumeShard(ecVolumeShard *EcVolumeShard) bool {
for _, s := range ev.Shards {
if s.ShardId == ecVolumeShard.ShardId {
return false
}
}
ev.Shards = append(ev.Shards, ecVolumeShard)
slices.SortFunc(ev.Shards, func(a, b *EcVolumeShard) int {
if a.VolumeId != b.VolumeId {
return int(a.VolumeId - b.VolumeId)
}
return int(a.ShardId - b.ShardId)
})
return true
}
func (ev *EcVolume) DeleteEcVolumeShard(shardId ShardId) (ecVolumeShard *EcVolumeShard, deleted bool) {
foundPosition := -1
for i, s := range ev.Shards {
if s.ShardId == shardId {
foundPosition = i
}
}
if foundPosition < 0 {
return nil, false
}
ecVolumeShard = ev.Shards[foundPosition]
ecVolumeShard.Unmount()
ev.Shards = append(ev.Shards[:foundPosition], ev.Shards[foundPosition+1:]...)
return ecVolumeShard, true
}
func (ev *EcVolume) FindEcVolumeShard(shardId ShardId) (ecVolumeShard *EcVolumeShard, found bool) {
for _, s := range ev.Shards {
if s.ShardId == shardId {
return s, true
}
}
return nil, false
}
func (ev *EcVolume) Close() {
for _, s := range ev.Shards {
s.Close()
}
if ev.ecjFile != nil {
ev.ecjFileAccessLock.Lock()
_ = ev.ecjFile.Close()
ev.ecjFile = nil
ev.ecjFileAccessLock.Unlock()
}
if ev.ecxFile != nil {
_ = ev.ecxFile.Sync()
_ = ev.ecxFile.Close()
ev.ecxFile = nil
}
}
// Sync flushes the .ecx and .ecj files to disk without closing them.
// This ensures that deletions made via DeleteNeedleFromEcx are visible
// to other processes/file handles that may read these files.
func (ev *EcVolume) Sync() {
if ev.ecjFile != nil {
ev.ecjFileAccessLock.Lock()
if err := ev.ecjFile.Sync(); err != nil {
glog.Warningf("failed to sync ecj file for volume %d: %v", ev.VolumeId, err)
}
ev.ecjFileAccessLock.Unlock()
}
if ev.ecxFile != nil {
if err := ev.ecxFile.Sync(); err != nil {
glog.Warningf("failed to sync ecx file for volume %d: %v", ev.VolumeId, err)
}
}
}
func (ev *EcVolume) Destroy() {
ev.Close()
for _, s := range ev.Shards {
s.Destroy()
}
os.Remove(ev.FileName(".ecx"))
os.Remove(ev.FileName(".ecj"))
os.Remove(ev.FileName(".vif"))
}
func (ev *EcVolume) FileName(ext string) string {
switch ext {
case ".ecx", ".ecj":
return ev.IndexBaseFileName() + ext
}
// .vif
return ev.DataBaseFileName() + ext
}
func (ev *EcVolume) DataBaseFileName() string {
return EcShardFileName(ev.Collection, ev.dir, int(ev.VolumeId))
}
func (ev *EcVolume) IndexBaseFileName() string {
return EcShardFileName(ev.Collection, ev.dirIdx, int(ev.VolumeId))
}
func (ev *EcVolume) ShardSize() uint64 {
if len(ev.Shards) > 0 {
return uint64(ev.Shards[0].Size())
}
return 0
}
func (ev *EcVolume) Size() (size uint64) {
for _, shard := range ev.Shards {
if shardSize := shard.Size(); shardSize > 0 {
size += uint64(shardSize)
}
}
return
}
func (ev *EcVolume) CreatedAt() time.Time {
return ev.ecxCreatedAt
}
func (ev *EcVolume) ShardIdList() (shardIds []ShardId) {
for _, s := range ev.Shards {
shardIds = append(shardIds, s.ShardId)
}
return
}
func (ev *EcVolume) ToVolumeEcShardInformationMessage(diskId uint32) (messages []*master_pb.VolumeEcShardInformationMessage) {
ecInfoPerVolume := map[needle.VolumeId]*master_pb.VolumeEcShardInformationMessage{}
for _, s := range ev.Shards {
m, ok := ecInfoPerVolume[s.VolumeId]
if !ok {
m = &master_pb.VolumeEcShardInformationMessage{
Id: uint32(s.VolumeId),
Collection: s.Collection,
DiskType: string(ev.diskType),
ExpireAtSec: ev.ExpireAtSec,
DiskId: diskId,
}
ecInfoPerVolume[s.VolumeId] = m
}
// Update EC shard bits and sizes.
si := ShardsInfoFromVolumeEcShardInformationMessage(m)
si.Set(s.ShardId, ShardSize(s.Size()))
m.EcIndexBits = uint32(si.Bitmap())
m.ShardSizes = si.SizesInt64()
}
for _, m := range ecInfoPerVolume {
messages = append(messages, m)
}
return
}
func (ev *EcVolume) LocateEcShardNeedle(needleId types.NeedleId, version needle.Version) (offset types.Offset, size types.Size, intervals []Interval, err error) {
// find the needle from ecx file
offset, size, err = ev.FindNeedleFromEcx(needleId)
if err != nil {
return types.Offset{}, 0, nil, fmt.Errorf("FindNeedleFromEcx: %w", err)
}
intervals = ev.LocateEcShardNeedleInterval(version, offset.ToActualOffset(), types.Size(needle.GetActualSize(size, version)))
return
}
func (ev *EcVolume) LocateEcShardNeedleInterval(version needle.Version, offset int64, size types.Size) (intervals []Interval) {
shard := ev.Shards[0]
// Usually shard will be padded to round of ErasureCodingSmallBlockSize.
// So in most cases, if shardSize equals to n * ErasureCodingLargeBlockSize,
// the data would be in small blocks.
shardSize := shard.ecdFileSize - 1
if ev.datFileSize > 0 {
// To get the correct LargeBlockRowsCount
// use datFileSize to calculate the shardSize to match the EC encoding logic.
shardSize = ev.datFileSize / int64(ev.ECContext.DataShards)
}
// calculate the locations in the ec shards
intervals = LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, shardSize, offset, types.Size(needle.GetActualSize(size, version)))
return
}
func (ev *EcVolume) FindNeedleFromEcx(needleId types.NeedleId) (offset types.Offset, size types.Size, err error) {
return SearchNeedleFromSortedIndex(ev.ecxFile, ev.ecxFileSize, needleId, nil)
}
func SearchNeedleFromSortedIndex(ecxFile *os.File, ecxFileSize int64, needleId types.NeedleId, processNeedleFn func(file *os.File, offset int64) error) (offset types.Offset, size types.Size, err error) {
var key types.NeedleId
buf := make([]byte, types.NeedleMapEntrySize)
l, h := int64(0), ecxFileSize/types.NeedleMapEntrySize
for l < h {
m := (l + h) / 2
if n, err := ecxFile.ReadAt(buf, m*types.NeedleMapEntrySize); err != nil {
if n != types.NeedleMapEntrySize {
return types.Offset{}, types.TombstoneFileSize, fmt.Errorf("ecx file %d read at %d: %v", ecxFileSize, m*types.NeedleMapEntrySize, err)
}
}
key, offset, size = idx.IdxFileEntry(buf)
if key == needleId {
if processNeedleFn != nil {
err = processNeedleFn(ecxFile, m*types.NeedleMapEntrySize)
}
return
}
if key < needleId {
l = m + 1
} else {
h = m
}
}
err = NotFoundError
return
}
func (ev *EcVolume) IsTimeToDestroy() bool {
return ev.ExpireAtSec > 0 && time.Now().Unix() > (int64(ev.ExpireAtSec)+destroyDelaySeconds)
}