Add disk-aware EC rebalancing (#7597)

* Add placement package for EC shard placement logic

- Consolidate EC shard placement algorithm for reuse across shell and worker tasks
- Support multi-pass selection: racks, then servers, then disks
- Include proper spread verification and scoring functions
- Comprehensive test coverage for various cluster topologies

* Make ec.balance disk-aware for multi-disk servers

- Add EcDisk struct to track individual disks on volume servers
- Update EcNode to maintain per-disk shard distribution
- Parse disk_id from EC shard information during topology collection
- Implement pickBestDiskOnNode() for selecting best disk per shard
- Add diskDistributionScore() for tie-breaking node selection
- Update all move operations to specify target disk in RPC calls
- Improves shard balance within multi-disk servers, not just across servers

* Use placement package in EC detection for consistent disk-level placement

- Replace custom EC disk selection logic with shared placement package
- Convert topology DiskInfo to placement.DiskCandidate format
- Use SelectDestinations() for multi-rack/server/disk spreading
- Convert placement results back to topology DiskInfo for task creation
- Ensures EC detection uses same placement logic as shell commands

* Make volume server evacuation disk-aware

- Use pickBestDiskOnNode() when selecting evacuation target disk
- Specify target disk in evacuation RPC requests
- Maintains balanced disk distribution during server evacuations

* Rename PlacementConfig to PlacementRequest for clarity

PlacementRequest better reflects that this is a request for placement
rather than a configuration object. This improves API semantics.

* Rename DefaultConfig to DefaultPlacementRequest

Aligns with the PlacementRequest type naming for consistency

* Address review comments from Gemini and CodeRabbit

Fix HIGH issues:
- Fix empty disk discovery: Now discovers all disks from VolumeInfos,
  not just from EC shards. This ensures disks without EC shards are
  still considered for placement.
- Fix EC shard count calculation in detection.go: Now correctly filters
  by DiskId and sums actual shard counts using ShardBits.ShardIdCount()
  instead of just counting EcShardInfo entries.

Fix MEDIUM issues:
- Add disk ID to evacuation log messages for consistency with other logging
- Remove unused serverToDisks variable in placement.go
- Fix comment that incorrectly said 'ascending' when sorting is 'descending'

* add ec tests

* Update ec-integration-tests.yml

* Update ec_integration_test.go

* Fix EC integration tests CI: build weed binary and update actions

- Add 'Build weed binary' step before running tests
- Update actions/setup-go from v4 to v6 (Node20 compatibility)
- Update actions/checkout from v2 to v4 (Node20 compatibility)
- Move working-directory to test step only

* Add disk-aware EC rebalancing integration tests

- Add TestDiskAwareECRebalancing test with multi-disk cluster setup
- Test EC encode with disk awareness (shows disk ID in output)
- Test EC balance with disk-level shard distribution
- Add helper functions for disk-level verification:
  - startMultiDiskCluster: 3 servers x 4 disks each
  - countShardsPerDisk: track shards per disk per server
  - calculateDiskShardVariance: measure distribution balance
- Verify no single disk is overloaded with shards
This commit is contained in:
Chris Lu
2025-12-02 12:30:15 -08:00
committed by GitHub
parent ebb06a3908
commit 4f038820dc
7 changed files with 1680 additions and 80 deletions

View File

@@ -9,6 +9,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/placement"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
@@ -429,85 +430,100 @@ func createECTaskParams(multiPlan *topology.MultiDestinationPlan) *worker_pb.Era
}
// selectBestECDestinations selects multiple disks for EC shard placement with diversity
// Uses the consolidated placement package for proper rack/server/disk spreading
func selectBestECDestinations(disks []*topology.DiskInfo, sourceRack, sourceDC string, shardsNeeded int) []*topology.DiskInfo {
if len(disks) == 0 {
return nil
}
// Group disks by rack and DC for diversity
rackGroups := make(map[string][]*topology.DiskInfo)
for _, disk := range disks {
rackKey := fmt.Sprintf("%s:%s", disk.DataCenter, disk.Rack)
rackGroups[rackKey] = append(rackGroups[rackKey], disk)
}
var selected []*topology.DiskInfo
usedRacks := make(map[string]bool)
// First pass: select one disk from each rack for maximum diversity
for rackKey, rackDisks := range rackGroups {
if len(selected) >= shardsNeeded {
break
}
// Select best disk from this rack
bestDisk := selectBestFromRack(rackDisks, sourceRack, sourceDC)
if bestDisk != nil {
selected = append(selected, bestDisk)
usedRacks[rackKey] = true
}
}
// Second pass: if we need more disks, select from racks we've already used
if len(selected) < shardsNeeded {
for _, disk := range disks {
if len(selected) >= shardsNeeded {
break
}
// Skip if already selected
alreadySelected := false
for _, sel := range selected {
if sel.NodeID == disk.NodeID && sel.DiskID == disk.DiskID {
alreadySelected = true
break
}
}
if !alreadySelected && isDiskSuitableForEC(disk) {
selected = append(selected, disk)
}
}
}
return selected
}
// selectBestFromRack selects the best disk from a rack for EC placement
func selectBestFromRack(disks []*topology.DiskInfo, sourceRack, sourceDC string) *topology.DiskInfo {
if len(disks) == 0 {
// Convert topology.DiskInfo to placement.DiskCandidate
candidates := diskInfosToCandidates(disks)
if len(candidates) == 0 {
return nil
}
var bestDisk *topology.DiskInfo
bestScore := -1.0
// Configure placement for EC shards
config := placement.PlacementRequest{
ShardsNeeded: shardsNeeded,
MaxShardsPerServer: 0, // No hard limit, but prefer spreading
MaxShardsPerRack: 0, // No hard limit, but prefer spreading
MaxTaskLoad: topology.MaxTaskLoadForECPlacement,
PreferDifferentServers: true,
PreferDifferentRacks: true,
}
// Use the shared placement algorithm
result, err := placement.SelectDestinations(candidates, config)
if err != nil {
glog.V(2).Infof("EC placement failed: %v", err)
return nil
}
// Convert back to topology.DiskInfo
return candidatesToDiskInfos(result.SelectedDisks, disks)
}
// diskInfosToCandidates converts topology.DiskInfo slice to placement.DiskCandidate slice
func diskInfosToCandidates(disks []*topology.DiskInfo) []*placement.DiskCandidate {
var candidates []*placement.DiskCandidate
for _, disk := range disks {
if !isDiskSuitableForEC(disk) {
if disk.DiskInfo == nil {
continue
}
score := calculateECScore(disk, sourceRack, sourceDC)
if score > bestScore {
bestScore = score
bestDisk = disk
// Calculate free slots (using default max if not set)
freeSlots := int(disk.DiskInfo.MaxVolumeCount - disk.DiskInfo.VolumeCount)
if freeSlots < 0 {
freeSlots = 0
}
// Calculate EC shard count for this specific disk
// EcShardInfos contains all shards, so we need to filter by DiskId and sum actual shard counts
ecShardCount := 0
if disk.DiskInfo.EcShardInfos != nil {
for _, shardInfo := range disk.DiskInfo.EcShardInfos {
if shardInfo.DiskId == disk.DiskID {
ecShardCount += erasure_coding.ShardBits(shardInfo.EcIndexBits).ShardIdCount()
}
}
}
candidates = append(candidates, &placement.DiskCandidate{
NodeID: disk.NodeID,
DiskID: disk.DiskID,
DataCenter: disk.DataCenter,
Rack: disk.Rack,
VolumeCount: disk.DiskInfo.VolumeCount,
MaxVolumeCount: disk.DiskInfo.MaxVolumeCount,
ShardCount: ecShardCount,
FreeSlots: freeSlots,
LoadCount: disk.LoadCount,
})
}
return candidates
}
// candidatesToDiskInfos converts placement results back to topology.DiskInfo
func candidatesToDiskInfos(candidates []*placement.DiskCandidate, originalDisks []*topology.DiskInfo) []*topology.DiskInfo {
// Create a map for quick lookup
diskMap := make(map[string]*topology.DiskInfo)
for _, disk := range originalDisks {
key := fmt.Sprintf("%s:%d", disk.NodeID, disk.DiskID)
diskMap[key] = disk
}
return bestDisk
var result []*topology.DiskInfo
for _, candidate := range candidates {
key := fmt.Sprintf("%s:%d", candidate.NodeID, candidate.DiskID)
if disk, ok := diskMap[key]; ok {
result = append(result, disk)
}
}
return result
}
// calculateECScore calculates placement score for EC operations
// Used for logging and plan metadata
func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) float64 {
if disk.DiskInfo == nil {
return 0.0
@@ -524,14 +540,12 @@ func calculateECScore(disk *topology.DiskInfo, sourceRack, sourceDC string) floa
// Consider current load (secondary factor)
score += (10.0 - float64(disk.LoadCount)) // Up to 10 points for low load
// Note: We don't penalize placing shards on the same rack/DC as source
// since the original volume will be deleted after EC conversion.
// This allows for better network efficiency and storage utilization.
return score
}
// isDiskSuitableForEC checks if a disk is suitable for EC placement
// Note: This is kept for backward compatibility but the placement package
// handles filtering internally
func isDiskSuitableForEC(disk *topology.DiskInfo) bool {
if disk.DiskInfo == nil {
return false