optimize: enable immediate EC shard reporting during startup (#7933)
* optimize: enable immediate EC shard reporting during startup
Ported the immediate EC shard reporting feature from the Enterprise version to the Community version.
This allows the master to be notified about EC shards immediately during volume server startup,
instead of waiting for the first heartbeat.
Changes:
1. Updated NewStore to initialize the notification channels BEFORE loading volumes (fixes a potential nil panic).
2. Added ecShardNotifyHandler to report EC shards to NewEcShardsChan during startup.
3. Implemented a non-blocking channel send for EC reporting to prevent a deadlock when loading many EC shards (fixes Enterprise bug 17ac1290c).
4. Updated DiskLocation and EC loading logic to support the callback.
This optimization improves cluster state consistency and startup speed for EC-heavy clusters.
* optimize: report actual EC shard size during startup
* optimize: increase notification channel buffer size to 1024
* optimize: fix variable shadowing in store.go
This commit is contained in:
@@ -86,6 +86,12 @@ func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int,
|
||||
s = &Store{grpcDialOption: grpcDialOption, Port: port, Ip: ip, GrpcPort: grpcPort, PublicUrl: publicUrl, Id: id, NeedleMapKind: needleMapKind}
|
||||
s.Locations = make([]*DiskLocation, 0)
|
||||
|
||||
s.NewVolumesChan = make(chan master_pb.VolumeShortInformationMessage, 1024)
|
||||
s.DeletedVolumesChan = make(chan master_pb.VolumeShortInformationMessage, 1024)
|
||||
|
||||
s.NewEcShardsChan = make(chan master_pb.VolumeEcShardInformationMessage, 1024)
|
||||
s.DeletedEcShardsChan = make(chan master_pb.VolumeEcShardInformationMessage, 1024)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < len(dirnames); i++ {
|
||||
location := NewDiskLocation(dirnames[i], int32(maxVolumeCounts[i]), minFreeSpaces[i], idxFolder, diskTypes[i])
|
||||
@@ -93,6 +99,33 @@ func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int,
|
||||
stats.VolumeServerMaxVolumeCounter.Add(float64(maxVolumeCounts[i]))
|
||||
|
||||
diskId := uint32(i) // Track disk ID
|
||||
|
||||
location.ecShardNotifyHandler = func(collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, ecVolume *erasure_coding.EcVolume) {
|
||||
var shardSize int64
|
||||
if shard, found := ecVolume.FindEcVolumeShard(shardId); found {
|
||||
shardSize = shard.Size()
|
||||
}
|
||||
si := erasure_coding.NewShardsInfo()
|
||||
si.Set(shardId, erasure_coding.ShardSize(shardSize))
|
||||
|
||||
// Use non-blocking send during startup to avoid deadlock
|
||||
// The channel reader only starts after connecting to master, but we're loading during startup
|
||||
select {
|
||||
case s.NewEcShardsChan <- master_pb.VolumeEcShardInformationMessage{
|
||||
Id: uint32(vid),
|
||||
Collection: collection,
|
||||
EcIndexBits: si.Bitmap(),
|
||||
ShardSizes: si.SizesInt64(),
|
||||
DiskType: string(location.DiskType),
|
||||
ExpireAtSec: ecVolume.ExpireAtSec,
|
||||
DiskId: diskId,
|
||||
}:
|
||||
default:
|
||||
// Channel full during startup - this is OK, heartbeat will report EC shards later
|
||||
glog.V(2).Infof("NewEcShardsChan full during startup for shard %d.%d, will be reported in heartbeat", vid, shardId)
|
||||
}
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
go func(id uint32, diskLoc *DiskLocation) {
|
||||
defer wg.Done()
|
||||
@@ -101,12 +134,6 @@ func NewStore(grpcDialOption grpc.DialOption, ip string, port int, grpcPort int,
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
s.NewVolumesChan = make(chan master_pb.VolumeShortInformationMessage, 3)
|
||||
s.DeletedVolumesChan = make(chan master_pb.VolumeShortInformationMessage, 3)
|
||||
|
||||
s.NewEcShardsChan = make(chan master_pb.VolumeEcShardInformationMessage, 3)
|
||||
s.DeletedEcShardsChan = make(chan master_pb.VolumeEcShardInformationMessage, 3)
|
||||
|
||||
return
|
||||
}
|
||||
func (s *Store) AddVolume(volumeId needle.VolumeId, collection string, needleMapKind NeedleMapKind, replicaPlacement string, ttlString string, preallocate int64, ver needle.Version, MemoryMapMaxSizeMb uint32, diskType DiskType, ldbTimeout int64) error {
|
||||
|
||||
Reference in New Issue
Block a user