Parallelize ec.rebuild operations per affected volume. (#7466)
* Parallelize `ec.rebuild` operations per affected volume.
* Require node.freeEcSlot >= slotsNeeded when selecting a rebuilder node.
* Improve variable names and help messages.
* Protect the read operation with the same mutex.
* Return a more accurate error message.
* Fix broken test.

Co-authored-by: chrislu <chris.lu@gmail.com>
Co-authored-by: Chris Lu <chrislusf@users.noreply.github.com>
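The parallelization below is built on the shell package's ErrorWaitGroup: the rebuilder schedules one task per affected volume with ewg.Add(func() error { ... }) and joins them with ewg.Wait(), bounded by -maxParallelization. As a rough mental model only (the real helper in weed/shell may differ in details such as error ordering or early cancellation), such a type can be sketched in Go as a semaphore-bounded worker pool that collects task errors:

package sketch

import (
    "errors"
    "sync"
)

// errorWaitGroup is a minimal stand-in for the ErrorWaitGroup used in the diff
// below: Add schedules a task, at most maxParallelization tasks run at once,
// and Wait blocks until every task finishes, returning the combined errors.
type errorWaitGroup struct {
    wg   sync.WaitGroup
    sem  chan struct{} // bounds the number of concurrently running tasks
    mu   sync.Mutex
    errs []error
}

func newErrorWaitGroup(maxParallelization int) *errorWaitGroup {
    return &errorWaitGroup{sem: make(chan struct{}, maxParallelization)}
}

// Add schedules fn; it starts running as soon as a concurrency slot is free.
func (g *errorWaitGroup) Add(fn func() error) {
    g.wg.Add(1)
    go func() {
        defer g.wg.Done()
        g.sem <- struct{}{}        // acquire a slot
        defer func() { <-g.sem }() // release it when the task ends
        if err := fn(); err != nil {
            g.mu.Lock()
            g.errs = append(g.errs, err)
            g.mu.Unlock()
        }
    }()
}

// Wait blocks until all scheduled tasks have finished and returns the
// collected errors joined together (nil if none failed).
func (g *errorWaitGroup) Wait() error {
    g.wg.Wait()
    return errors.Join(g.errs...)
}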
@@ -5,6 +5,7 @@ import (
     "flag"
     "fmt"
     "io"
+    "sync"
 
     "github.com/seaweedfs/seaweedfs/weed/operation"
     "github.com/seaweedfs/seaweedfs/weed/pb"
@@ -18,12 +19,14 @@ func init() {
 }
 
 type ecRebuilder struct {
-    // TODO: add ErrorWaitGroup for parallelization
     commandEnv   *CommandEnv
     ecNodes      []*EcNode
     writer       io.Writer
     applyChanges bool
     collections  []string
+
+    ewg       *ErrorWaitGroup
+    ecNodesMu sync.Mutex
 }
 
 type commandEcRebuild struct {
@@ -36,7 +39,14 @@ func (c *commandEcRebuild) Name() string {
 func (c *commandEcRebuild) Help() string {
     return `find and rebuild missing ec shards among volume servers
 
-    ec.rebuild [-c EACH_COLLECTION|<collection_name>] [-apply]
+    ec.rebuild [-c EACH_COLLECTION|<collection_name>] [-apply] [-maxParallelization N]
+
+    Options:
+        -collection: specify a collection name, or "EACH_COLLECTION" to process all collections
+        -apply: actually perform the rebuild operations (default is dry-run mode)
+        -maxParallelization: number of volumes to rebuild concurrently (default: 10)
+            Increase for faster rebuilds with more system resources.
+            Decrease if experiencing resource contention or instability.
 
     Algorithm:
 
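For example, a dry run against one collection followed by an applied rebuild could look like this (the collection name and the parallelism value are illustrative, not defaults):

    ec.rebuild -collection my_collection -maxParallelization 20
    ec.rebuild -collection my_collection -maxParallelization 20 -apply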
@@ -71,13 +81,13 @@ func (c *commandEcRebuild) Do(args []string, commandEnv *CommandEnv, writer io.W
 
     fixCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
     collection := fixCommand.String("collection", "EACH_COLLECTION", "collection name, or \"EACH_COLLECTION\" for each collection")
+    maxParallelization := fixCommand.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible")
     applyChanges := fixCommand.Bool("apply", false, "apply the changes")
     // TODO: remove this alias
     applyChangesAlias := fixCommand.Bool("force", false, "apply the changes (alias for -apply)")
     if err = fixCommand.Parse(args); err != nil {
         return nil
     }
 
     handleDeprecatedForceFlag(writer, fixCommand, applyChangesAlias, applyChanges)
     infoAboutSimulationMode(writer, *applyChanges, "-apply")
 
@@ -107,17 +117,16 @@ func (c *commandEcRebuild) Do(args []string, commandEnv *CommandEnv, writer io.W
         writer:       writer,
         applyChanges: *applyChanges,
         collections:  collections,
+
+        ewg: NewErrorWaitGroup(*maxParallelization),
     }
 
     fmt.Printf("rebuildEcVolumes for %d collection(s)\n", len(collections))
     for _, c := range collections {
-        fmt.Printf("rebuildEcVolumes collection %s\n", c)
-        if err = erb.rebuildEcVolumes(c); err != nil {
-            return err
-        }
+        erb.rebuildEcVolumes(c)
     }
 
-    return nil
+    return erb.ewg.Wait()
 }
 
 func (erb *ecRebuilder) write(format string, a ...any) {
@@ -128,30 +137,87 @@ func (erb *ecRebuilder) isLocked() bool {
     return erb.commandEnv.isLocked()
 }
 
-// ecNodeWithMoreFreeSlots returns the EC node with higher free slot count, from all nodes visible to the rebuilder.
-func (erb *ecRebuilder) ecNodeWithMoreFreeSlots() *EcNode {
+// countLocalShards returns the number of shards already present locally on the node for the given volume.
+func (erb *ecRebuilder) countLocalShards(node *EcNode, collection string, volumeId needle.VolumeId) int {
+    for _, diskInfo := range node.info.DiskInfos {
+        for _, ecShardInfo := range diskInfo.EcShardInfos {
+            if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
+                shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
+                return len(shardBits.ShardIds())
+            }
+        }
+    }
+    return 0
+}
+
+// selectAndReserveRebuilder atomically selects a rebuilder node with sufficient free slots
+// and reserves slots only for the non-local shards that need to be copied/generated.
+func (erb *ecRebuilder) selectAndReserveRebuilder(collection string, volumeId needle.VolumeId) (*EcNode, int, error) {
+    erb.ecNodesMu.Lock()
+    defer erb.ecNodesMu.Unlock()
+
     if len(erb.ecNodes) == 0 {
-        return nil
+        return nil, 0, fmt.Errorf("no ec nodes available")
     }
 
-    res := erb.ecNodes[0]
-    for i := 1; i < len(erb.ecNodes); i++ {
-        if erb.ecNodes[i].freeEcSlot > res.freeEcSlot {
-            res = erb.ecNodes[i]
+    // Find the node with the most free slots, considering local shards
+    var bestNode *EcNode
+    var bestSlotsNeeded int
+    var maxAvailableSlots int
+    var minSlotsNeeded int = erasure_coding.TotalShardsCount // Start with maximum possible
+    for _, node := range erb.ecNodes {
+        localShards := erb.countLocalShards(node, collection, volumeId)
+        slotsNeeded := erasure_coding.TotalShardsCount - localShards
+        if slotsNeeded < 0 {
+            slotsNeeded = 0
+        }
+
+        if node.freeEcSlot > maxAvailableSlots {
+            maxAvailableSlots = node.freeEcSlot
+        }
+
+        if slotsNeeded < minSlotsNeeded {
+            minSlotsNeeded = slotsNeeded
+        }
+
+        if node.freeEcSlot >= slotsNeeded {
+            if bestNode == nil || node.freeEcSlot > bestNode.freeEcSlot {
+                bestNode = node
+                bestSlotsNeeded = slotsNeeded
+            }
         }
     }
 
-    return res
+    if bestNode == nil {
+        return nil, 0, fmt.Errorf("no node has sufficient free slots for volume %d (need at least %d slots, max available: %d)",
+            volumeId, minSlotsNeeded, maxAvailableSlots)
+    }
+
+    // Reserve slots only for non-local shards
+    bestNode.freeEcSlot -= bestSlotsNeeded
+
+    return bestNode, bestSlotsNeeded, nil
 }
 
-func (erb *ecRebuilder) rebuildEcVolumes(collection string) error {
-    fmt.Printf("rebuildEcVolumes %s\n", collection)
+// releaseRebuilder releases the reserved slots back to the rebuilder node.
+func (erb *ecRebuilder) releaseRebuilder(node *EcNode, slotsToRelease int) {
+    erb.ecNodesMu.Lock()
+    defer erb.ecNodesMu.Unlock()
+
+    // Release slots by incrementing the free slot count
+    node.freeEcSlot += slotsToRelease
+}
+
+func (erb *ecRebuilder) rebuildEcVolumes(collection string) {
+    fmt.Printf("rebuildEcVolumes for %q\n", collection)
 
     // collect vid => each shard locations, similar to ecShardMap in topology.go
     ecShardMap := make(EcShardMap)
+    erb.ecNodesMu.Lock()
     for _, ecNode := range erb.ecNodes {
         ecShardMap.registerEcNode(ecNode, collection)
     }
+    erb.ecNodesMu.Unlock()
 
     for vid, locations := range ecShardMap {
         shardCount := locations.shardCount()
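To make the slot accounting above concrete: with SeaweedFS's standard EC layout of 10 data + 4 parity shards (erasure_coding.TotalShardsCount = 14), a candidate node that already holds 5 shards of the volume yields countLocalShards = 5, so selectAndReserveRebuilder reserves only 14 - 5 = 9 free slots on it. The old code required a full TotalShardsCount free slots on the rebuilder regardless of which shards were already local.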
@@ -159,31 +225,37 @@ func (erb *ecRebuilder) rebuildEcVolumes(collection string) error {
             continue
         }
         if shardCount < erasure_coding.DataShardsCount {
-            return fmt.Errorf("ec volume %d is unrepairable with %d shards", vid, shardCount)
+            // Capture variables for closure
+            vid := vid
+            shardCount := shardCount
+            erb.ewg.Add(func() error {
+                return fmt.Errorf("ec volume %d is unrepairable with %d shards", vid, shardCount)
+            })
+            continue
         }
 
-        if err := erb.rebuildOneEcVolume(collection, vid, locations); err != nil {
-            return err
-        }
+        // Capture variables for closure
+        vid := vid
+        locations := locations
+
+        erb.ewg.Add(func() error {
+            // Select rebuilder and reserve slots atomically per volume
+            rebuilder, slotsToReserve, err := erb.selectAndReserveRebuilder(collection, vid)
+            if err != nil {
+                return fmt.Errorf("failed to select rebuilder for volume %d: %v", vid, err)
+            }
+            defer erb.releaseRebuilder(rebuilder, slotsToReserve)
+
+            return erb.rebuildOneEcVolume(collection, vid, locations, rebuilder)
+        })
     }
-
-    return nil
 }
 
-func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.VolumeId, locations EcShardLocations) error {
+func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.VolumeId, locations EcShardLocations, rebuilder *EcNode) error {
     if !erb.isLocked() {
         return fmt.Errorf("lock is lost")
     }
 
-    // TODO: fix this logic so it supports concurrent executions
-    rebuilder := erb.ecNodeWithMoreFreeSlots()
-    if rebuilder == nil {
-        return fmt.Errorf("no EC nodes available for rebuild")
-    }
-    if rebuilder.freeEcSlot < erasure_coding.TotalShardsCount {
-        return fmt.Errorf("disk space is not enough")
-    }
-
     fmt.Printf("rebuildOneEcVolume %s %d\n", collection, volumeId)
 
     // collect shard files to rebuilder local disk
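A note on the vid := vid, shardCount := shardCount and locations := locations re-declarations in the hunk above: they guard against the classic Go loop-variable capture problem. Before Go 1.22 the range variable is shared across iterations, so closures queued with ewg.Add could all observe the last volume processed by the loop. A minimal illustration of the failure mode and the fix (hypothetical helper signature, not SeaweedFS code):

package sketch

import "fmt"

// scheduleBuggy queues one closure per volume id. On Go versions before 1.22
// the loop variable vid is shared across iterations, so by the time the
// closures actually run they may all observe the last id in the slice.
func scheduleBuggy(vids []int, schedule func(func() error)) {
    for _, vid := range vids {
        schedule(func() error {
            fmt.Println("rebuilding volume", vid) // may print the wrong vid pre-1.22
            return nil
        })
    }
}

// scheduleFixed shadows the loop variable per iteration, as the diff does,
// so each closure captures its own copy regardless of the Go version.
func scheduleFixed(vids []int, schedule func(func() error)) {
    for _, vid := range vids {
        vid := vid // capture a per-iteration copy
        schedule(func() error {
            fmt.Println("rebuilding volume", vid)
            return nil
        })
    }
}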
@@ -219,6 +291,9 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
         return err
     }
 
+    // ensure ECNode updates are atomic
+    erb.ecNodesMu.Lock()
+    defer erb.ecNodesMu.Unlock()
     rebuilder.addEcVolumeShards(volumeId, collection, generatedShardIds)
 
     return nil