seaweedFS/weed/worker/tasks/ec_balance/detection.go

package ec_balance

import (
	"context"
	"fmt"
	"sort"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/util/wildcard"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// ecNodeInfo represents a volume server with EC shard information for detection
type ecNodeInfo struct {
	nodeID    string
	address   string
	dc        string
	rack      string // dc:rack composite key
	freeSlots int
	// volumeID -> shardBits (bitmask of shard IDs present on this node)
	ecShards map[uint32]*ecVolumeInfo
}

type ecVolumeInfo struct {
	collection string
	shardBits  uint32 // bitmask
	diskID     uint32
}

// ecRackInfo represents a rack with EC node information
type ecRackInfo struct {
	nodes     map[string]*ecNodeInfo
	freeSlots int
}

// shardMove represents a proposed EC shard move
type shardMove struct {
	volumeID   uint32
	shardID    int
	collection string
	source     *ecNodeInfo
	sourceDisk uint32
	target     *ecNodeInfo
	targetDisk uint32
	phase      string // "dedup", "cross_rack", "within_rack", "global"
}

// Detection implements the multi-phase EC shard balance detection algorithm.
// It analyzes EC shard distribution and proposes moves to achieve even distribution.
func Detection(
	ctx context.Context,
	metrics []*types.VolumeHealthMetrics,
	clusterInfo *types.ClusterInfo,
	config base.TaskConfig,
	maxResults int,
) ([]*types.TaskDetectionResult, bool, error) {
	if !config.IsEnabled() {
		return nil, false, nil
	}

	ecConfig := config.(*Config)
	if maxResults < 0 {
		maxResults = 0
	}

	if clusterInfo == nil || clusterInfo.ActiveTopology == nil {
		return nil, false, fmt.Errorf("active topology not available for EC balance detection")
	}

	topoInfo := clusterInfo.ActiveTopology.GetTopologyInfo()
	if topoInfo == nil {
		return nil, false, fmt.Errorf("topology info not available")
	}

	// Build EC topology view
	nodes, racks := buildECTopology(topoInfo, ecConfig)

	if len(nodes) < ecConfig.MinServerCount {
		glog.V(1).Infof("EC balance: only %d servers, need at least %d", len(nodes), ecConfig.MinServerCount)
		return nil, false, nil
	}

	// Collect all EC volumes grouped by collection
	collections := collectECCollections(nodes, ecConfig)
	if len(collections) == 0 {
		glog.V(1).Infof("EC balance: no EC volumes found matching filters")
		return nil, false, nil
	}

	threshold := ecConfig.ImbalanceThreshold
	var allMoves []*shardMove

	// Build set of allowed collections for global phase filtering
	allowedVids := make(map[uint32]bool)
	for _, volumeIDs := range collections {
		for _, vid := range volumeIDs {
			allowedVids[vid] = true
		}
	}

	for collection, volumeIDs := range collections {
		if ctx != nil {
			if err := ctx.Err(); err != nil {
				return nil, false, err
			}
		}

		// Phase 1: Detect duplicate shards (always run, duplicates are errors not imbalance)
		for _, vid := range volumeIDs {
			moves := detectDuplicateShards(vid, collection, nodes, ecConfig.DiskType)
			applyMovesToTopology(moves)
			allMoves = append(allMoves, moves...)
		}

		// Phase 2: Balance shards across racks (operates on updated topology from phase 1)
		for _, vid := range volumeIDs {
			moves := detectCrossRackImbalance(vid, collection, nodes, racks, ecConfig.DiskType, threshold)
			applyMovesToTopology(moves)
			allMoves = append(allMoves, moves...)
		}

		// Phase 3: Balance shards within racks (operates on updated topology from phases 1-2)
		for _, vid := range volumeIDs {
			moves := detectWithinRackImbalance(vid, collection, nodes, racks, ecConfig.DiskType, threshold)
			applyMovesToTopology(moves)
			allMoves = append(allMoves, moves...)
		}
	}

	// Phase 4: Global node balance across racks (only for volumes in allowed collections)
	globalMoves := detectGlobalImbalance(nodes, racks, ecConfig, allowedVids)
	allMoves = append(allMoves, globalMoves...)

	// Cap results
	hasMore := false
	if maxResults > 0 && len(allMoves) > maxResults {
		allMoves = allMoves[:maxResults]
		hasMore = true
	}

	// Convert moves to TaskDetectionResults
	now := time.Now()
	results := make([]*types.TaskDetectionResult, 0, len(allMoves))
	for i, move := range allMoves {
		// Include loop index and source/target in TaskID for uniqueness
		taskID := fmt.Sprintf("ec_balance_%d_%d_%s_%s_%d_%d",
			move.volumeID, move.shardID,
			move.source.nodeID, move.target.nodeID,
			now.UnixNano(), i)

		result := &types.TaskDetectionResult{
			TaskID:     taskID,
			TaskType:   types.TaskTypeECBalance,
			VolumeID:   move.volumeID,
			Server:     move.source.nodeID,
			Collection: move.collection,
			Priority:   movePhasePriority(move.phase),
			Reason: fmt.Sprintf("EC shard %d.%d %s: %s → %s (%s)",
				move.volumeID, move.shardID, move.phase,
				move.source.nodeID, move.target.nodeID, move.phase),
			ScheduleAt: now,
			TypedParams: &worker_pb.TaskParams{
				TaskId:     taskID,
				VolumeId:   move.volumeID,
				Collection: move.collection,
				Sources: []*worker_pb.TaskSource{{
					Node:     move.source.address,
					DiskId:   move.sourceDisk,
					Rack:     move.source.rack,
					ShardIds: []uint32{uint32(move.shardID)},
				}},
				Targets: []*worker_pb.TaskTarget{{
					Node:     move.target.address,
					DiskId:   move.targetDisk,
					Rack:     move.target.rack,
					ShardIds: []uint32{uint32(move.shardID)},
				}},
				TaskParams: &worker_pb.TaskParams_EcBalanceParams{
					EcBalanceParams: &worker_pb.EcBalanceTaskParams{
						DiskType:       ecConfig.DiskType,
						TimeoutSeconds: 600,
					},
				},
			},
		}
		results = append(results, result)
	}

	glog.V(1).Infof("EC balance detection: %d moves proposed across %d collections",
		len(results), len(collections))

	return results, hasMore, nil
}

// buildECTopology constructs EC node and rack structures from topology info.
// Rack keys are dc:rack composites to avoid cross-DC name collisions.
// Only racks with eligible nodes (matching disk type, having EC shards or capacity) are included.
func buildECTopology(topoInfo *master_pb.TopologyInfo, config *Config) (map[string]*ecNodeInfo, map[string]*ecRackInfo) {
	nodes := make(map[string]*ecNodeInfo)
	racks := make(map[string]*ecRackInfo)

	for _, dc := range topoInfo.DataCenterInfos {
		if config.DataCenterFilter != "" {
			matchers := wildcard.CompileWildcardMatchers(config.DataCenterFilter)
			if !wildcard.MatchesAnyWildcard(matchers, dc.Id) {
				continue
			}
		}

		for _, rack := range dc.RackInfos {
			// Use dc:rack composite key to avoid cross-DC name collisions
			rackKey := dc.Id + ":" + rack.Id

			for _, dn := range rack.DataNodeInfos {
				node := &ecNodeInfo{
					nodeID:   dn.Id,
					address:  dn.Id,
					dc:       dc.Id,
					rack:     rackKey,
					ecShards: make(map[uint32]*ecVolumeInfo),
				}

				hasMatchingDisk := false
				for diskType, diskInfo := range dn.DiskInfos {
					if config.DiskType != "" && diskType != config.DiskType {
						continue
					}
					hasMatchingDisk = true

					freeSlots := int(diskInfo.MaxVolumeCount-diskInfo.VolumeCount)*erasure_coding.DataShardsCount - countEcShards(diskInfo.EcShardInfos)
					if freeSlots > 0 {
						node.freeSlots += freeSlots
					}

					for _, ecShardInfo := range diskInfo.EcShardInfos {
						vid := ecShardInfo.Id
						existing, ok := node.ecShards[vid]
						if !ok {
							existing = &ecVolumeInfo{
								collection: ecShardInfo.Collection,
								diskID:     ecShardInfo.DiskId,
							}
							node.ecShards[vid] = existing
						}
						existing.shardBits |= ecShardInfo.EcIndexBits
					}
				}

				if !hasMatchingDisk {
					continue
				}

				nodes[dn.Id] = node

				// Only create rack entry when we have an eligible node
				if _, ok := racks[rackKey]; !ok {
					racks[rackKey] = &ecRackInfo{nodes: make(map[string]*ecNodeInfo)}
				}
				racks[rackKey].nodes[dn.Id] = node
				racks[rackKey].freeSlots += node.freeSlots
			}
		}
	}

	return nodes, racks
}

// collectECCollections groups EC volume IDs by collection, applying filters
func collectECCollections(nodes map[string]*ecNodeInfo, config *Config) map[string][]uint32 {
	allowedCollections := wildcard.CompileWildcardMatchers(config.CollectionFilter)

	// Collect unique volume IDs per collection
	collectionVids := make(map[string]map[uint32]bool)
	for _, node := range nodes {
		for vid, info := range node.ecShards {
			if len(allowedCollections) > 0 && !wildcard.MatchesAnyWildcard(allowedCollections, info.collection) {
				continue
			}
			if _, ok := collectionVids[info.collection]; !ok {
				collectionVids[info.collection] = make(map[uint32]bool)
			}
			collectionVids[info.collection][vid] = true
		}
	}

	// Convert to sorted slices
	result := make(map[string][]uint32, len(collectionVids))
	for collection, vids := range collectionVids {
		vidSlice := make([]uint32, 0, len(vids))
		for vid := range vids {
			vidSlice = append(vidSlice, vid)
		}
		sort.Slice(vidSlice, func(i, j int) bool { return vidSlice[i] < vidSlice[j] })
		result[collection] = vidSlice
	}

	return result
}

// detectDuplicateShards finds shards that exist on multiple nodes.
// Duplicates are always returned regardless of threshold since they are data errors.
func detectDuplicateShards(vid uint32, collection string, nodes map[string]*ecNodeInfo, diskType string) []*shardMove {
	// Build shard -> list of nodes mapping
	shardLocations := make(map[int][]*ecNodeInfo)
	for _, node := range nodes {
		info, ok := node.ecShards[vid]
		if !ok {
			continue
		}
		for shardID := 0; shardID < erasure_coding.MaxShardCount; shardID++ {
			if info.shardBits&(1<<uint(shardID)) != 0 {
				shardLocations[shardID] = append(shardLocations[shardID], node)
			}
		}
	}

	var moves []*shardMove
	for shardID, locs := range shardLocations {
		if len(locs) <= 1 {
			continue
		}
		// Keep the copy on the node with most free slots (ascending sort, keep last)
		sort.Slice(locs, func(i, j int) bool { return locs[i].freeSlots < locs[j].freeSlots })

		// Propose deletion of all other copies (skip the keeper at the end).
		// Set target=source so isDedupPhase() recognizes this as unmount+delete only.
		for _, node := range locs[:len(locs)-1] {
			moves = append(moves, &shardMove{
				volumeID:   vid,
				shardID:    shardID,
				collection: collection,
				source:     node,
				sourceDisk: ecShardDiskID(node, vid),
				target:     node,
				targetDisk: ecShardDiskID(node, vid),
				phase:      "dedup",
			})
		}
	}

	return moves
}

// detectCrossRackImbalance detects shards that should be moved across racks for even distribution.
// Returns nil if imbalance is below the threshold.
func detectCrossRackImbalance(vid uint32, collection string, nodes map[string]*ecNodeInfo, racks map[string]*ecRackInfo, diskType string, threshold float64) []*shardMove {
	numRacks := len(racks)
	if numRacks <= 1 {
		return nil
	}

	// Count shards per rack for this volume
	rackShardCount := make(map[string]int)
	rackShardNodes := make(map[string][]*ecNodeInfo)
	totalShards := 0

	for _, node := range nodes {
		info, ok := node.ecShards[vid]
		if !ok {
			continue
		}
		count := shardBitCount(info.shardBits)
		if count > 0 {
			rackShardCount[node.rack] += count
			rackShardNodes[node.rack] = append(rackShardNodes[node.rack], node)
			totalShards += count
		}
	}

	if totalShards == 0 {
		return nil
	}

	// Check if imbalance exceeds threshold
	if !exceedsImbalanceThreshold(rackShardCount, totalShards, numRacks, threshold) {
		return nil
	}

	maxPerRack := ceilDivide(totalShards, numRacks)

	var moves []*shardMove

	// Find over-loaded racks and move excess shards to under-loaded racks
	for rackID, count := range rackShardCount {
		if count <= maxPerRack {
			continue
		}
		excess := count - maxPerRack
		movedFromRack := 0

		// Find shards to move from this rack
		for _, node := range rackShardNodes[rackID] {
			if movedFromRack >= excess {
				break
			}
			info := node.ecShards[vid]
			for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
				if movedFromRack >= excess {
					break
				}
				if info.shardBits&(1<<uint(shardID)) == 0 {
					continue
				}

				// Find destination: rack with fewest shards of this volume
				destNode := findDestNodeInUnderloadedRack(vid, racks, rackShardCount, maxPerRack, rackID, nodes)
				if destNode == nil {
					continue
				}

				moves = append(moves, &shardMove{
					volumeID:   vid,
					shardID:    shardID,
					collection: collection,
					source:     node,
					sourceDisk: ecShardDiskID(node, vid),
					target:     destNode,
					targetDisk: ecShardDiskID(destNode, vid),
					phase:      "cross_rack",
				})
				movedFromRack++

				// Reserve capacity on destination so it isn't picked again
				rackShardCount[destNode.rack]++
				rackShardCount[rackID]--
				destNode.freeSlots--
			}
		}
	}

	return moves
}

// detectWithinRackImbalance detects shards that should be moved within racks for even node distribution.
// Returns nil if imbalance is below the threshold.
func detectWithinRackImbalance(vid uint32, collection string, nodes map[string]*ecNodeInfo, racks map[string]*ecRackInfo, diskType string, threshold float64) []*shardMove {
	var moves []*shardMove

	for _, rack := range racks {
		if len(rack.nodes) <= 1 {
			continue
		}

		// Count shards per node in this rack for this volume
		nodeShardCount := make(map[string]int)
		totalInRack := 0
		for nodeID, node := range rack.nodes {
			info, ok := node.ecShards[vid]
			if !ok {
				continue
			}
			count := shardBitCount(info.shardBits)
			nodeShardCount[nodeID] = count
			totalInRack += count
		}

		if totalInRack == 0 {
			continue
		}

		// Check if imbalance exceeds threshold
		if !exceedsImbalanceThreshold(nodeShardCount, totalInRack, len(rack.nodes), threshold) {
			continue
		}

		maxPerNode := ceilDivide(totalInRack, len(rack.nodes))

		// Find over-loaded nodes and move excess
		for nodeID, count := range nodeShardCount {
			if count <= maxPerNode {
				continue
			}
			excess := count - maxPerNode
			node := rack.nodes[nodeID]
			info := node.ecShards[vid]
			moved := 0

			for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
				if moved >= excess {
					break
				}
				if info.shardBits&(1<<uint(shardID)) == 0 {
					continue
				}

				// Find least-loaded node in same rack
				destNode := findLeastLoadedNodeInRack(vid, rack, nodeID, nodeShardCount, maxPerNode)
				if destNode == nil {
					continue
				}

				moves = append(moves, &shardMove{
					volumeID:   vid,
					shardID:    shardID,
					collection: collection,
					source:     node,
					sourceDisk: ecShardDiskID(node, vid),
					target:     destNode,
					targetDisk: 0,
					phase:      "within_rack",
				})
				moved++
				nodeShardCount[nodeID]--
				nodeShardCount[destNode.nodeID]++
				destNode.freeSlots--
			}
		}
	}

	return moves
}

// detectGlobalImbalance detects total shard count imbalance across nodes in each rack.
// Respects ImbalanceThreshold from config. Only considers volumes in allowedVids.
func detectGlobalImbalance(nodes map[string]*ecNodeInfo, racks map[string]*ecRackInfo, config *Config, allowedVids map[uint32]bool) []*shardMove {
	var moves []*shardMove

	for _, rack := range racks {
		if len(rack.nodes) <= 1 {
			continue
		}

		// Count total EC shards per node (only for allowed volumes)
		nodeShardCounts := make(map[string]int)
		totalShards := 0
		for nodeID, node := range rack.nodes {
			count := 0
			for vid, info := range node.ecShards {
				if len(allowedVids) > 0 && !allowedVids[vid] {
					continue
				}
				count += shardBitCount(info.shardBits)
			}
			nodeShardCounts[nodeID] = count
			totalShards += count
		}

		if totalShards == 0 {
			continue
		}

		// Check if imbalance exceeds threshold
		if !exceedsImbalanceThreshold(nodeShardCounts, totalShards, len(rack.nodes), config.ImbalanceThreshold) {
			continue
		}

		avgShards := ceilDivide(totalShards, len(rack.nodes))

		// Iteratively move shards from most-loaded to least-loaded
		for i := 0; i < 10; i++ { // cap iterations to avoid infinite loops
			// Find min and max nodes, skipping full nodes for min
			var minNode, maxNode *ecNodeInfo
			minCount, maxCount := totalShards+1, -1
			for nodeID, count := range nodeShardCounts {
				node := rack.nodes[nodeID]
				if count < minCount && node.freeSlots > 0 {
					minCount = count
					minNode = node
				}
				if count > maxCount {
					maxCount = count
					maxNode = rack.nodes[nodeID]
				}
			}

			if maxNode == nil || minNode == nil || maxNode.nodeID == minNode.nodeID {
				break
			}
			if maxCount <= avgShards || minCount+1 > avgShards {
				break
			}
			if maxCount-minCount <= 1 {
				break
			}

			// Pick a shard from maxNode that doesn't already exist on minNode
			moved := false
			for vid, info := range maxNode.ecShards {
				if moved {
					break
				}
				if len(allowedVids) > 0 && !allowedVids[vid] {
					continue
				}
				// Check minNode doesn't have this volume's shards already (avoid same-volume overlap)
				minInfo := minNode.ecShards[vid]
				for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ {
					if info.shardBits&(1<<uint(shardID)) == 0 {
						continue
					}
					// Skip if destination already has this shard
					if minInfo != nil && minInfo.shardBits&(1<<uint(shardID)) != 0 {
						continue
					}

					moves = append(moves, &shardMove{
						volumeID:   vid,
						shardID:    shardID,
						collection: info.collection,
						source:     maxNode,
						sourceDisk: info.diskID,
						target:     minNode,
						targetDisk: 0,
						phase:      "global",
					})
					nodeShardCounts[maxNode.nodeID]--
					nodeShardCounts[minNode.nodeID]++
					minNode.freeSlots--
					moved = true
					break
				}
			}
			if !moved {
				break
			}
		}
	}

	return moves
}

// findDestNodeInUnderloadedRack finds a node in a rack that has fewer than maxPerRack shards
func findDestNodeInUnderloadedRack(vid uint32, racks map[string]*ecRackInfo, rackShardCount map[string]int, maxPerRack int, excludeRack string, nodes map[string]*ecNodeInfo) *ecNodeInfo {
	var bestNode *ecNodeInfo
	bestFreeSlots := -1

	for rackID, rack := range racks {
		if rackID == excludeRack {
			continue
		}
		if rackShardCount[rackID] >= maxPerRack {
			continue
		}
		if rack.freeSlots <= 0 {
			continue
		}
		for _, node := range rack.nodes {
			if node.freeSlots <= 0 {
				continue
			}
			if node.freeSlots > bestFreeSlots {
				bestFreeSlots = node.freeSlots
				bestNode = node
			}
		}
	}

	return bestNode
}

// findLeastLoadedNodeInRack finds the node with fewest shards in a rack
func findLeastLoadedNodeInRack(vid uint32, rack *ecRackInfo, excludeNode string, nodeShardCount map[string]int, maxPerNode int) *ecNodeInfo {
	var bestNode *ecNodeInfo
	bestCount := maxPerNode + 1

	for nodeID, node := range rack.nodes {
		if nodeID == excludeNode {
			continue
		}
		if node.freeSlots <= 0 {
			continue
		}
		count := nodeShardCount[nodeID]
		if count >= maxPerNode {
			continue
		}
		if count < bestCount {
			bestCount = count
			bestNode = node
		}
	}

	return bestNode
}

// exceedsImbalanceThreshold checks if the distribution of counts exceeds the threshold.
// numGroups is the total number of groups (including those with 0 shards that aren't in the map).
// imbalanceRatio = (maxCount - minCount) / avgCount
func exceedsImbalanceThreshold(counts map[string]int, total int, numGroups int, threshold float64) bool {
	if numGroups <= 1 || total == 0 {
		return false
	}

	minCount := 0 // groups not in map have 0 shards
	if len(counts) >= numGroups {
		// All groups have entries; find actual min
		minCount = total + 1
		for _, count := range counts {
			if count < minCount {
				minCount = count
			}
		}
	}

	maxCount := -1
	for _, count := range counts {
		if count > maxCount {
			maxCount = count
		}
	}

	avg := float64(total) / float64(numGroups)
	if avg == 0 {
		return false
	}

	imbalanceRatio := float64(maxCount-minCount) / avg
	return imbalanceRatio > threshold
}

// applyMovesToTopology simulates planned moves on the in-memory topology
// so subsequent detection phases see updated shard placement.
func applyMovesToTopology(moves []*shardMove) {
	for _, move := range moves {
		shardBit := uint32(1 << uint(move.shardID))

		// Remove shard from source
		if srcInfo, ok := move.source.ecShards[move.volumeID]; ok {
			srcInfo.shardBits &^= shardBit
		}

		// For non-dedup moves, add shard to target
		if move.source.nodeID != move.target.nodeID {
			dstInfo, ok := move.target.ecShards[move.volumeID]
			if !ok {
				dstInfo = &ecVolumeInfo{
					collection: move.collection,
					diskID:     move.targetDisk,
				}
				move.target.ecShards[move.volumeID] = dstInfo
			}
			dstInfo.shardBits |= shardBit
		}
	}
}

// Helper functions

func countEcShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) int {
	count := 0
	for _, eci := range ecShardInfos {
		count += erasure_coding.GetShardCount(eci)
	}
	return count
}

func shardBitCount(bits uint32) int {
	count := 0
	for bits != 0 {
		count += int(bits & 1)
		bits >>= 1
	}
	return count
}

func ecShardDiskID(node *ecNodeInfo, vid uint32) uint32 {
	if info, ok := node.ecShards[vid]; ok {
		return info.diskID
	}
	return 0
}

func ceilDivide(a, b int) int {
	if b == 0 {
		return 0
	}
	return (a + b - 1) / b
}

func movePhasePriority(phase string) types.TaskPriority {
	switch phase {
	case "dedup":
		return types.TaskPriorityHigh
	case "cross_rack":
		return types.TaskPriorityMedium
	default:
		return types.TaskPriorityLow
	}
}