// Package ec_balance detects imbalanced erasure-coded (EC) shard placement
// across volume servers and proposes shard moves to even it out.
package ec_balance

import (
	"context"
	"fmt"
	"sort"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/util/wildcard"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// ecNodeInfo represents a volume server with EC shard information for detection
type ecNodeInfo struct {
	nodeID  string
	address string // same value as nodeID at build time (see buildECTopology)
	dc      string
	rack    string // dc:rack composite key
	// freeSlots is the estimated number of additional EC shards this node can
	// accept; derived from disk capacity in buildECTopology.
	freeSlots int
	// volumeID -> shardBits (bitmask of shard IDs present on this node)
	ecShards map[uint32]*ecVolumeInfo
}

// ecVolumeInfo describes the EC shards of a single volume held on one node.
type ecVolumeInfo struct {
	collection string
	shardBits  uint32 // bitmask
	diskID     uint32
}

// ecRackInfo represents a rack with EC node information
type ecRackInfo struct {
	nodes     map[string]*ecNodeInfo
	freeSlots int
}

// shardMove represents a proposed EC shard move
type shardMove struct {
	volumeID   uint32
	shardID    int
	collection string
	source     *ecNodeInfo
	sourceDisk uint32
	target     *ecNodeInfo
	targetDisk uint32
	phase      string // "dedup", "cross_rack", "within_rack", "global"
}

// Detection implements the multi-phase EC shard balance detection algorithm.
// It analyzes EC shard distribution and proposes moves to achieve even distribution.
func Detection(
	ctx context.Context,
	metrics []*types.VolumeHealthMetrics, // NOTE(review): currently unused by EC balance detection
	clusterInfo *types.ClusterInfo,
	config base.TaskConfig,
	maxResults int,
) ([]*types.TaskDetectionResult, bool, error) {
	if !config.IsEnabled() {
		return nil, false, nil
	}
	// NOTE(review): this type assertion panics if a non-*Config is passed;
	// presumably the task framework guarantees the concrete type — confirm.
	ecConfig := config.(*Config)
	if maxResults < 0 {
		maxResults = 0
	}
	if clusterInfo == nil || clusterInfo.ActiveTopology == nil {
		return nil, false, fmt.Errorf("active topology not available for EC balance detection")
	}
	topoInfo := clusterInfo.ActiveTopology.GetTopologyInfo()
	if topoInfo == nil {
		return nil, false, fmt.Errorf("topology info not available")
	}
	// Build EC topology view
	nodes, racks := buildECTopology(topoInfo, ecConfig)
	if len(nodes) < ecConfig.MinServerCount {
		glog.V(1).Infof("EC balance: only %d servers, need at least %d", len(nodes), ecConfig.MinServerCount)
		return nil, false, nil
	}
	// Collect all EC volumes grouped by collection
	collections := collectECCollections(nodes, ecConfig)
	if len(collections) == 0 {
		glog.V(1).Infof("EC balance: no EC volumes found matching filters")
		return nil, false, nil
	}
	threshold := ecConfig.ImbalanceThreshold
	var allMoves []*shardMove
	// Build set of allowed collections for global phase filtering
	allowedVids := make(map[uint32]bool)
	for _, volumeIDs := range collections {
		for _, vid := range volumeIDs {
			allowedVids[vid] = true
		}
	}
	for collection, volumeIDs := range collections {
		// Honor caller cancellation between collections.
		if ctx != nil {
			if err := ctx.Err(); err != nil {
				return nil, false, err
			}
		}
		// Phase 1: Detect duplicate shards (always run, duplicates are errors not imbalance)
		for _, vid := range volumeIDs {
			moves := detectDuplicateShards(vid, collection, nodes, ecConfig.DiskType)
			applyMovesToTopology(moves)
			allMoves = append(allMoves, moves...)
		}
		// Phase 2: Balance shards across racks (operates on updated topology from phase 1)
		for _, vid := range volumeIDs {
			moves := detectCrossRackImbalance(vid, collection, nodes, racks, ecConfig.DiskType, threshold)
			applyMovesToTopology(moves)
			allMoves = append(allMoves, moves...)
		}
		// Phase 3: Balance shards within racks (operates on updated topology from phases 1-2)
		for _, vid := range volumeIDs {
			moves := detectWithinRackImbalance(vid, collection, nodes, racks, ecConfig.DiskType, threshold)
			applyMovesToTopology(moves)
			allMoves = append(allMoves, moves...)
		}
	}
	// Phase 4: Global node balance across racks (only for volumes in allowed collections)
	globalMoves := detectGlobalImbalance(nodes, racks, ecConfig, allowedVids)
	allMoves = append(allMoves, globalMoves...)
	// Cap results
	hasMore := false
	if maxResults > 0 && len(allMoves) > maxResults {
		allMoves = allMoves[:maxResults]
		hasMore = true
	}
	// Convert moves to TaskDetectionResults
	now := time.Now()
	results := make([]*types.TaskDetectionResult, 0, len(allMoves))
	for i, move := range allMoves {
		// Include loop index and source/target in TaskID for uniqueness
		taskID := fmt.Sprintf("ec_balance_%d_%d_%s_%s_%d_%d", move.volumeID, move.shardID, move.source.nodeID, move.target.nodeID, now.UnixNano(), i)
		result := &types.TaskDetectionResult{
			TaskID:     taskID,
			TaskType:   types.TaskTypeECBalance,
			VolumeID:   move.volumeID,
			Server:     move.source.nodeID,
			Collection: move.collection,
			Priority:   movePhasePriority(move.phase),
			// NOTE(review): phase appears twice in this message (both %s after the
			// shard id and the trailing parenthetical) — likely one is redundant.
			Reason:     fmt.Sprintf("EC shard %d.%d %s: %s → %s (%s)", move.volumeID, move.shardID, move.phase, move.source.nodeID, move.target.nodeID, move.phase),
			ScheduleAt: now,
			TypedParams: &worker_pb.TaskParams{
				TaskId:     taskID,
				VolumeId:   move.volumeID,
				Collection: move.collection,
				Sources: []*worker_pb.TaskSource{{
					Node:     move.source.address,
					DiskId:   move.sourceDisk,
					Rack:     move.source.rack,
					ShardIds: []uint32{uint32(move.shardID)},
				}},
				Targets: []*worker_pb.TaskTarget{{
					Node:     move.target.address,
					DiskId:   move.targetDisk,
					Rack:     move.target.rack,
					ShardIds: []uint32{uint32(move.shardID)},
				}},
				TaskParams: &worker_pb.TaskParams_EcBalanceParams{
					EcBalanceParams: &worker_pb.EcBalanceTaskParams{
						DiskType:       ecConfig.DiskType,
						TimeoutSeconds: 600,
					},
				},
			},
		}
		results = append(results, result)
	}
	glog.V(1).Infof("EC balance detection: %d moves proposed across %d collections", len(results), len(collections))
	return results, hasMore, nil
}

// buildECTopology constructs EC node and rack structures from topology info.
// Rack keys are dc:rack composites to avoid cross-DC name collisions.
// Only racks with eligible nodes (matching disk type, having EC shards or capacity) are included.
func buildECTopology(topoInfo *master_pb.TopologyInfo, config *Config) (map[string]*ecNodeInfo, map[string]*ecRackInfo) {
	nodes := make(map[string]*ecNodeInfo)
	racks := make(map[string]*ecRackInfo)
	for _, dc := range topoInfo.DataCenterInfos {
		if config.DataCenterFilter != "" {
			matchers := wildcard.CompileWildcardMatchers(config.DataCenterFilter)
			if !wildcard.MatchesAnyWildcard(matchers, dc.Id) {
				continue
			}
		}
		for _, rack := range dc.RackInfos {
			// Use dc:rack composite key to avoid cross-DC name collisions
			rackKey := dc.Id + ":" + rack.Id
			for _, dn := range rack.DataNodeInfos {
				node := &ecNodeInfo{
					nodeID:   dn.Id,
					address:  dn.Id,
					dc:       dc.Id,
					rack:     rackKey,
					ecShards: make(map[uint32]*ecVolumeInfo),
				}
				hasMatchingDisk := false
				for diskType, diskInfo := range dn.DiskInfos {
					if config.DiskType != "" && diskType != config.DiskType {
						continue
					}
					hasMatchingDisk = true
					// Free EC slots = free volume slots converted to shard units
					// (one volume slot holds DataShardsCount shards), minus shards
					// already stored on this disk.
					freeSlots := int(diskInfo.MaxVolumeCount-diskInfo.VolumeCount)*erasure_coding.DataShardsCount - countEcShards(diskInfo.EcShardInfos)
					if freeSlots > 0 {
						node.freeSlots += freeSlots
					}
					for _, ecShardInfo := range diskInfo.EcShardInfos {
						vid := ecShardInfo.Id
						existing, ok := node.ecShards[vid]
						if !ok {
							existing = &ecVolumeInfo{
								collection: ecShardInfo.Collection,
								diskID:     ecShardInfo.DiskId,
							}
							node.ecShards[vid] = existing
						}
						// Merge shard bitmasks across disks of the same node.
						existing.shardBits |= ecShardInfo.EcIndexBits
					}
				}
				if !hasMatchingDisk {
					continue
				}
				nodes[dn.Id] = node
				// Only create rack entry when we have an eligible node
				if _, ok := racks[rackKey]; !ok {
					racks[rackKey] = &ecRackInfo{nodes: make(map[string]*ecNodeInfo)}
				}
				racks[rackKey].nodes[dn.Id] = node
				racks[rackKey].freeSlots += node.freeSlots
			}
		}
	}
	return nodes, racks
}

// collectECCollections groups EC volume IDs by collection, applying filters
func collectECCollections(nodes map[string]*ecNodeInfo, config *Config) map[string][]uint32 {
	allowedCollections := wildcard.CompileWildcardMatchers(config.CollectionFilter)
	// Collect unique volume IDs per collection
	collectionVids := make(map[string]map[uint32]bool)
	for _, node := range nodes {
		for vid, info := range node.ecShards {
			if len(allowedCollections) > 0 && !wildcard.MatchesAnyWildcard(allowedCollections, info.collection) {
				continue
			}
			if _, ok := collectionVids[info.collection]; !ok {
				collectionVids[info.collection] = make(map[uint32]bool)
			}
			collectionVids[info.collection][vid] = true
		}
	}
	// Convert to sorted slices (sorted for deterministic detection output)
	result := make(map[string][]uint32, len(collectionVids))
	for collection, vids := range collectionVids {
		vidSlice := make([]uint32, 0, len(vids))
		for vid := range vids {
			vidSlice = append(vidSlice, vid)
		}
		sort.Slice(vidSlice, func(i, j int) bool { return vidSlice[i] < vidSlice[j] })
		result[collection] = vidSlice
	}
	return result
}

// detectDuplicateShards finds shards that exist on multiple nodes.
// Duplicates are always returned regardless of threshold since they are data errors.
// NOTE(review): this region of the file is corrupted. Several spans beginning at
// "(1<" below look like HTML-tag stripping: everything from "<<uint(shardID)..."
// up to the next ">" character in the original text was swallowed, taking whole
// function bodies and at least two function headers with it. The surviving bytes
// are preserved verbatim here (this will NOT compile); recover the authoritative
// text from upstream before building.
func detectDuplicateShards(vid uint32, collection string, nodes map[string]*ecNodeInfo, diskType string) []*shardMove {
	// Build shard -> list of nodes mapping
	shardLocations := make(map[int][]*ecNodeInfo)
	for _, node := range nodes {
		info, ok := node.ecShards[vid]
		if !ok {
			continue
		}
		for shardID := 0; shardID < erasure_coding.MaxShardCount; shardID++ {
			// NOTE(review): garbled — the bit test was presumably
			// "info.shardBits&(1<<uint(shardID)) != 0". The end of this function,
			// the header of the cross-rack detection function, and its per-rack
			// counting setup are missing between here and the " 0 {" residue below.
			if info.shardBits&(1< 0 {
				rackShardCount[node.rack] += count
				rackShardNodes[node.rack] = append(rackShardNodes[node.rack], node)
				totalShards += count
			}
		}
	if totalShards == 0 {
		return nil
	}
	// Check if imbalance exceeds threshold
	if !exceedsImbalanceThreshold(rackShardCount, totalShards, numRacks, threshold) {
		return nil
	}
	maxPerRack := ceilDivide(totalShards, numRacks)
	var moves []*shardMove
	// Find over-loaded racks and move excess shards to under-loaded racks
	for rackID, count := range rackShardCount {
		if count <= maxPerRack {
			continue
		}
		excess := count - maxPerRack
		movedFromRack := 0
		// Find shards to move from this rack
		for _, node := range rackShardNodes[rackID] {
			if movedFromRack >= excess {
				break
			}
			info := node.ecShards[vid]
			for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
				if movedFromRack >= excess {
					break
				}
				// NOTE(review): two garbled spans follow — the shard-bit test plus
				// move construction is missing, and the text then jumps into a
				// per-node counting loop from a later function (within-rack or
				// global phase).
				if info.shardBits&(1<= excess { break }
				if info.shardBits&(1< 0 && !allowedVids[vid] { continue }
				count += shardBitCount(info.shardBits)
			}
			nodeShardCounts[nodeID] = count
			totalShards += count
		}
		if totalShards == 0 {
			continue
		}
		// Check if imbalance exceeds threshold
		if !exceedsImbalanceThreshold(nodeShardCounts, totalShards, len(rack.nodes), config.ImbalanceThreshold) {
			continue
		}
		avgShards := ceilDivide(totalShards, len(rack.nodes))
		// Iteratively move shards from most-loaded to least-loaded
		for i := 0; i < 10; i++ { // cap iterations to avoid infinite loops
			// Find min and max nodes, skipping full nodes for min
			var minNode, maxNode *ecNodeInfo
			minCount, maxCount := totalShards+1, -1
			for nodeID, count := range nodeShardCounts {
				node := rack.nodes[nodeID]
				if count < minCount && node.freeSlots > 0 {
					minCount = count
					minNode = node
				}
				if count > maxCount {
					maxCount = count
					maxNode = rack.nodes[nodeID]
				}
			}
			if maxNode == nil || minNode == nil || maxNode.nodeID == minNode.nodeID {
				break
			}
			if maxCount <= avgShards || minCount+1 > avgShards {
				break
			}
			if maxCount-minCount <= 1 {
				break
			}
			// Pick a shard from maxNode that doesn't already exist on minNode
			moved := false
			for vid, info := range maxNode.ecShards {
				if moved {
					break
				}
				if len(allowedVids) > 0 && !allowedVids[vid] {
					continue
				}
				// Check minNode doesn't have this volume's shards already (avoid same-volume overlap)
				minInfo := minNode.ecShards[vid]
				for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
					// NOTE(review): garbled span — the shard pick/move logic and the
					// header of a rack-capacity helper are missing; text resumes
					// inside a best-rack selection loop.
					if info.shardBits&(1<= maxPerRack { continue }
					if rack.freeSlots <= 0 {
						continue
					}
					for _, node := range rack.nodes {
						if node.freeSlots <= 0 {
							continue
						}
						if node.freeSlots > bestFreeSlots {
							bestFreeSlots = node.freeSlots
							bestNode = node
						}
					}
				}
			}
	return bestNode
}

// findLeastLoadedNodeInRack finds the node with fewest shards in a rack
func findLeastLoadedNodeInRack(vid uint32, rack *ecRackInfo, excludeNode string, nodeShardCount map[string]int, maxPerNode int) *ecNodeInfo {
	var bestNode *ecNodeInfo
	bestCount := maxPerNode + 1
	for nodeID, node := range rack.nodes {
		if nodeID == excludeNode {
			continue
		}
		if node.freeSlots <= 0 {
			continue
		}
		count := nodeShardCount[nodeID]
		if count >= maxPerNode {
			continue
		}
		if count < bestCount {
			bestCount = count
			bestNode = node
		}
	}
	return bestNode
}

// exceedsImbalanceThreshold checks if the distribution of counts exceeds the threshold.
// numGroups is the total number of groups (including those with 0 shards that aren't in the map).
// imbalanceRatio = (maxCount - minCount) / avgCount func exceedsImbalanceThreshold(counts map[string]int, total int, numGroups int, threshold float64) bool { if numGroups <= 1 || total == 0 { return false } minCount := 0 // groups not in map have 0 shards if len(counts) >= numGroups { // All groups have entries; find actual min minCount = total + 1 for _, count := range counts { if count < minCount { minCount = count } } } maxCount := -1 for _, count := range counts { if count > maxCount { maxCount = count } } avg := float64(total) / float64(numGroups) if avg == 0 { return false } imbalanceRatio := float64(maxCount-minCount) / avg return imbalanceRatio > threshold } // applyMovesToTopology simulates planned moves on the in-memory topology // so subsequent detection phases see updated shard placement. func applyMovesToTopology(moves []*shardMove) { for _, move := range moves { shardBit := uint32(1 << uint(move.shardID)) // Remove shard from source if srcInfo, ok := move.source.ecShards[move.volumeID]; ok { srcInfo.shardBits &^= shardBit } // For non-dedup moves, add shard to target if move.source.nodeID != move.target.nodeID { dstInfo, ok := move.target.ecShards[move.volumeID] if !ok { dstInfo = &ecVolumeInfo{ collection: move.collection, diskID: move.targetDisk, } move.target.ecShards[move.volumeID] = dstInfo } dstInfo.shardBits |= shardBit } } } // Helper functions func countEcShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) int { count := 0 for _, eci := range ecShardInfos { count += erasure_coding.GetShardCount(eci) } return count } func shardBitCount(bits uint32) int { count := 0 for bits != 0 { count += int(bits & 1) bits >>= 1 } return count } func ecShardDiskID(node *ecNodeInfo, vid uint32) uint32 { if info, ok := node.ecShards[vid]; ok { return info.diskID } return 0 } func ceilDivide(a, b int) int { if b == 0 { return 0 } return (a + b - 1) / b } func movePhasePriority(phase string) types.TaskPriority { switch phase { case "dedup": return 
types.TaskPriorityHigh case "cross_rack": return types.TaskPriorityMedium default: return types.TaskPriorityLow } }