* feat(balance): add replica placement validation for volume moves When the volume balance detection proposes moving a volume, validate that the move does not violate the volume's replication policy (e.g., ReplicaPlacement=010 requires replicas on different racks). If the preferred destination violates the policy, fall back to score-based planning; if that also violates, skip the volume entirely. - Add ReplicaLocation type and VolumeReplicaMap to ClusterInfo - Build replica map from all volumes before collection filtering - Port placement validation logic from command_volume_fix_replication.go - Thread replica map through collectVolumeMetrics call chain - Add IsGoodMove check in createBalanceTask before destination use * address PR review: extract validation closure, add defensive checks - Extract validateMove closure to eliminate duplicated ReplicaLocation construction and IsGoodMove calls - Add defensive check for empty replica map entries (len(replicas) == 0) - Add bounds check for int-to-byte cast on ExpectedReplicas (0-255) * address nitpick: rp test helper accepts *testing.T and fails on error Prevents silent failures from typos in replica placement codes. * address review: add composite replica placement tests (011, 110) Test multi-constraint placement policies where both rack and DC rules must be satisfied simultaneously. * address review: use struct keys instead of string concatenation Replace string-concatenated map keys with typed rackKey/nodeKey structs to eliminate allocations and avoid ambiguity if IDs contain spaces. * address review: simplify bounds check, log fallback error, guard source - Remove unreachable ExpectedReplicas < 0 branch (outer condition already guarantees > 0), fold bounds check into single condition - Log error from planBalanceDestination in replica validation fallback - Return false from IsGoodMove when sourceNodeID not found in existing replicas (inconsistent cluster state) * address review: use slices.Contains instead of hand-rolled helpers Replace isAmongDC and isAmongRack with slices.Contains from the standard library, reducing boilerplate.
669 lines
25 KiB
Go
669 lines
25 KiB
Go
package balance
|
||
|
||
import (
|
||
"fmt"
|
||
"math"
|
||
"sort"
|
||
"strings"
|
||
"time"
|
||
|
||
"github.com/seaweedfs/seaweedfs/weed/admin/topology"
|
||
"github.com/seaweedfs/seaweedfs/weed/glog"
|
||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
|
||
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
|
||
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
|
||
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/util"
|
||
"github.com/seaweedfs/seaweedfs/weed/worker/types"
|
||
)
|
||
|
||
// Detection implements the detection logic for balance tasks.
|
||
// maxResults limits how many balance operations are returned per invocation.
|
||
// A non-positive maxResults means no explicit limit (uses a large default).
|
||
// The returned truncated flag is true when detection stopped because it hit
|
||
// maxResults rather than running out of work.
|
||
func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterInfo, config base.TaskConfig, maxResults int) ([]*types.TaskDetectionResult, bool, error) {
|
||
if !config.IsEnabled() {
|
||
return nil, false, nil
|
||
}
|
||
if clusterInfo == nil {
|
||
return nil, false, nil
|
||
}
|
||
|
||
balanceConfig := config.(*Config)
|
||
|
||
if maxResults <= 0 {
|
||
maxResults = math.MaxInt32
|
||
}
|
||
|
||
// Group volumes by disk type to ensure we compare apples to apples
|
||
volumesByDiskType := make(map[string][]*types.VolumeHealthMetrics)
|
||
for _, metric := range metrics {
|
||
volumesByDiskType[metric.DiskType] = append(volumesByDiskType[metric.DiskType], metric)
|
||
}
|
||
|
||
// Sort disk types for deterministic iteration order when maxResults
|
||
// spans multiple disk types.
|
||
diskTypes := make([]string, 0, len(volumesByDiskType))
|
||
for dt := range volumesByDiskType {
|
||
diskTypes = append(diskTypes, dt)
|
||
}
|
||
sort.Strings(diskTypes)
|
||
|
||
var allParams []*types.TaskDetectionResult
|
||
truncated := false
|
||
|
||
for _, diskType := range diskTypes {
|
||
remaining := maxResults - len(allParams)
|
||
if remaining <= 0 {
|
||
truncated = true
|
||
break
|
||
}
|
||
tasks, diskTruncated := detectForDiskType(diskType, volumesByDiskType[diskType], balanceConfig, clusterInfo, remaining)
|
||
allParams = append(allParams, tasks...)
|
||
if diskTruncated {
|
||
truncated = true
|
||
}
|
||
}
|
||
|
||
return allParams, truncated, nil
|
||
}
|
||
|
||
// detectForDiskType performs balance detection for a specific disk type,
|
||
// returning up to maxResults balance tasks and whether it was truncated by the limit.
|
||
func detectForDiskType(diskType string, diskMetrics []*types.VolumeHealthMetrics, balanceConfig *Config, clusterInfo *types.ClusterInfo, maxResults int) ([]*types.TaskDetectionResult, bool) {
|
||
// Skip if cluster segment is too small
|
||
minVolumeCount := 2 // More reasonable for small clusters
|
||
if len(diskMetrics) < minVolumeCount {
|
||
// Only log at verbose level to avoid spamming for small/empty disk types
|
||
glog.V(1).Infof("BALANCE [%s]: No tasks created - cluster too small (%d volumes, need ≥%d)", diskType, len(diskMetrics), minVolumeCount)
|
||
return nil, false
|
||
}
|
||
|
||
// Analyze volume distribution across servers.
|
||
// Seed from ActiveTopology so servers with matching disk type but zero
|
||
// volumes are included in the count and imbalance calculation.
|
||
// Also collect MaxVolumeCount per server to compute utilization ratios.
|
||
serverVolumeCounts := make(map[string]int)
|
||
serverMaxVolumes := make(map[string]int64)
|
||
if clusterInfo.ActiveTopology != nil {
|
||
topologyInfo := clusterInfo.ActiveTopology.GetTopologyInfo()
|
||
if topologyInfo != nil {
|
||
rackFilterSet := util.ParseCSVSet(balanceConfig.RackFilter)
|
||
nodeFilterSet := util.ParseCSVSet(balanceConfig.NodeFilter)
|
||
dcFilter := strings.TrimSpace(balanceConfig.DataCenterFilter)
|
||
for _, dc := range topologyInfo.DataCenterInfos {
|
||
if dcFilter != "" && dc.Id != dcFilter {
|
||
continue
|
||
}
|
||
for _, rack := range dc.RackInfos {
|
||
if rackFilterSet != nil && !rackFilterSet[rack.Id] {
|
||
continue
|
||
}
|
||
for _, node := range rack.DataNodeInfos {
|
||
if nodeFilterSet != nil && !nodeFilterSet[node.Id] {
|
||
continue
|
||
}
|
||
for diskTypeName, diskInfo := range node.DiskInfos {
|
||
if diskTypeName == diskType {
|
||
serverVolumeCounts[node.Id] = 0
|
||
serverMaxVolumes[node.Id] += diskInfo.MaxVolumeCount
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
hasLocationFilter := balanceConfig.DataCenterFilter != "" || balanceConfig.RackFilter != "" || balanceConfig.NodeFilter != ""
|
||
for _, metric := range diskMetrics {
|
||
if hasLocationFilter {
|
||
// Only count metrics for servers that passed filtering.
|
||
// Without this guard, out-of-scope servers are re-introduced.
|
||
if _, allowed := serverVolumeCounts[metric.Server]; !allowed {
|
||
continue
|
||
}
|
||
}
|
||
serverVolumeCounts[metric.Server]++
|
||
}
|
||
|
||
if len(serverVolumeCounts) < balanceConfig.MinServerCount {
|
||
glog.V(1).Infof("BALANCE [%s]: No tasks created - too few servers (%d servers, need ≥%d)", diskType, len(serverVolumeCounts), balanceConfig.MinServerCount)
|
||
return nil, false
|
||
}
|
||
|
||
// Seed adjustments from existing pending/assigned balance tasks so that
|
||
// effectiveCounts reflects in-flight moves and prevents over-scheduling.
|
||
var adjustments map[string]int
|
||
if clusterInfo.ActiveTopology != nil {
|
||
adjustments = clusterInfo.ActiveTopology.GetTaskServerAdjustments(topology.TaskTypeBalance)
|
||
}
|
||
if adjustments == nil {
|
||
adjustments = make(map[string]int)
|
||
}
|
||
// Servers where we can no longer find eligible volumes or plan destinations
|
||
exhaustedServers := make(map[string]bool)
|
||
|
||
// Sort servers for deterministic iteration and tie-breaking
|
||
sortedServers := make([]string, 0, len(serverVolumeCounts))
|
||
for server := range serverVolumeCounts {
|
||
sortedServers = append(sortedServers, server)
|
||
}
|
||
sort.Strings(sortedServers)
|
||
|
||
// Pre-index volumes by server with cursors to avoid O(maxResults * volumes) scanning.
|
||
// Sort each server's volumes by VolumeID for deterministic selection.
|
||
volumesByServer := make(map[string][]*types.VolumeHealthMetrics, len(serverVolumeCounts))
|
||
for _, metric := range diskMetrics {
|
||
volumesByServer[metric.Server] = append(volumesByServer[metric.Server], metric)
|
||
}
|
||
for _, vols := range volumesByServer {
|
||
sort.Slice(vols, func(i, j int) bool {
|
||
return vols[i].VolumeID < vols[j].VolumeID
|
||
})
|
||
}
|
||
serverCursors := make(map[string]int, len(serverVolumeCounts))
|
||
|
||
var results []*types.TaskDetectionResult
|
||
balanced := false
|
||
|
||
// Decide upfront whether all servers have MaxVolumeCount info.
|
||
// If any server is missing it, fall back to raw counts for ALL servers
|
||
// to avoid mixing utilization ratios (0.0–1.0) with raw counts.
|
||
allServersHaveMaxInfo := true
|
||
for _, server := range sortedServers {
|
||
if maxVol, ok := serverMaxVolumes[server]; !ok || maxVol <= 0 {
|
||
allServersHaveMaxInfo = false
|
||
glog.V(1).Infof("BALANCE [%s]: Server %s is missing MaxVolumeCount info, falling back to raw volume counts for balancing", diskType, server)
|
||
break
|
||
}
|
||
}
|
||
|
||
var serverUtilization func(server string, effectiveCount int) float64
|
||
if allServersHaveMaxInfo {
|
||
serverUtilization = func(server string, effectiveCount int) float64 {
|
||
return float64(effectiveCount) / float64(serverMaxVolumes[server])
|
||
}
|
||
} else {
|
||
serverUtilization = func(_ string, effectiveCount int) float64 {
|
||
return float64(effectiveCount)
|
||
}
|
||
}
|
||
|
||
for len(results) < maxResults {
|
||
// Compute effective volume counts with adjustments from planned moves
|
||
effectiveCounts := make(map[string]int, len(serverVolumeCounts))
|
||
for server, count := range serverVolumeCounts {
|
||
effective := count + adjustments[server]
|
||
if effective < 0 {
|
||
effective = 0
|
||
}
|
||
effectiveCounts[server] = effective
|
||
}
|
||
|
||
// Find the most and least utilized servers using utilization ratio
|
||
// (volumes / maxVolumes) so that servers with higher capacity are
|
||
// expected to hold proportionally more volumes.
|
||
maxUtilization := -1.0
|
||
minUtilization := math.Inf(1)
|
||
maxServer := ""
|
||
minServer := ""
|
||
|
||
for _, server := range sortedServers {
|
||
count := effectiveCounts[server]
|
||
util := serverUtilization(server, count)
|
||
// Min is calculated across all servers for an accurate imbalance ratio
|
||
if util < minUtilization {
|
||
minUtilization = util
|
||
minServer = server
|
||
}
|
||
// Max is only among non-exhausted servers since we can only move from them
|
||
if exhaustedServers[server] {
|
||
continue
|
||
}
|
||
if util > maxUtilization {
|
||
maxUtilization = util
|
||
maxServer = server
|
||
}
|
||
}
|
||
|
||
if maxServer == "" {
|
||
// All servers exhausted
|
||
glog.V(1).Infof("BALANCE [%s]: All overloaded servers exhausted after %d task(s)", diskType, len(results))
|
||
break
|
||
}
|
||
|
||
// Check if utilization imbalance exceeds threshold.
|
||
// imbalanceRatio is the difference between the most and least utilized
|
||
// servers, expressed as a fraction of mean utilization.
|
||
avgUtilization := (maxUtilization + minUtilization) / 2.0
|
||
var imbalanceRatio float64
|
||
if avgUtilization > 0 {
|
||
imbalanceRatio = (maxUtilization - minUtilization) / avgUtilization
|
||
}
|
||
if imbalanceRatio <= balanceConfig.ImbalanceThreshold {
|
||
if len(results) == 0 {
|
||
glog.Infof("BALANCE [%s]: No tasks created - cluster well balanced. Imbalance=%.1f%% (threshold=%.1f%%). MaxUtil=%.1f%% on %s, MinUtil=%.1f%% on %s",
|
||
diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, maxUtilization*100, maxServer, minUtilization*100, minServer)
|
||
} else {
|
||
glog.Infof("BALANCE [%s]: Created %d task(s), cluster now balanced. Imbalance=%.1f%% (threshold=%.1f%%)",
|
||
diskType, len(results), imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100)
|
||
}
|
||
balanced = true
|
||
break
|
||
}
|
||
|
||
// When the global max and min effective counts differ by at most 1,
|
||
// no single move can improve balance — it would just swap which server
|
||
// is min vs max. Stop here to avoid infinite oscillation when the
|
||
// threshold is unachievable (e.g., 11 vols across 4 servers: best is
|
||
// 3/3/3/2, imbalance=36%). We scan ALL servers' effective counts so the
|
||
// check works regardless of whether utilization or raw counts are used.
|
||
globalMaxCount, globalMinCount := 0, math.MaxInt
|
||
for _, c := range effectiveCounts {
|
||
if c > globalMaxCount {
|
||
globalMaxCount = c
|
||
}
|
||
if c < globalMinCount {
|
||
globalMinCount = c
|
||
}
|
||
}
|
||
if globalMaxCount-globalMinCount <= 1 {
|
||
if len(results) == 0 {
|
||
glog.Infof("BALANCE [%s]: No tasks created - cluster as balanced as possible. Imbalance=%.1f%% (threshold=%.1f%%), but max-min diff is %d",
|
||
diskType, imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, globalMaxCount-globalMinCount)
|
||
} else {
|
||
glog.Infof("BALANCE [%s]: Created %d task(s), cluster as balanced as possible. Imbalance=%.1f%% (threshold=%.1f%%), max-min diff=%d",
|
||
diskType, len(results), imbalanceRatio*100, balanceConfig.ImbalanceThreshold*100, globalMaxCount-globalMinCount)
|
||
}
|
||
balanced = true
|
||
break
|
||
}
|
||
|
||
// Select a volume from the overloaded server using per-server cursor
|
||
var selectedVolume *types.VolumeHealthMetrics
|
||
serverVols := volumesByServer[maxServer]
|
||
cursor := serverCursors[maxServer]
|
||
for cursor < len(serverVols) {
|
||
metric := serverVols[cursor]
|
||
cursor++
|
||
// Skip volumes that already have a task in ActiveTopology
|
||
if clusterInfo.ActiveTopology != nil && clusterInfo.ActiveTopology.HasAnyTask(metric.VolumeID) {
|
||
continue
|
||
}
|
||
selectedVolume = metric
|
||
break
|
||
}
|
||
serverCursors[maxServer] = cursor
|
||
|
||
if selectedVolume == nil {
|
||
glog.V(1).Infof("BALANCE [%s]: No more eligible volumes on overloaded server %s, trying other servers", diskType, maxServer)
|
||
exhaustedServers[maxServer] = true
|
||
continue
|
||
}
|
||
|
||
// Create task targeting minServer — the greedy algorithm's natural choice.
|
||
// Using minServer instead of letting planBalanceDestination independently
|
||
// pick a destination ensures that the detection loop's effective counts
|
||
// and the destination selection stay in sync. Without this, the topology's
|
||
// LoadCount-based scoring can diverge from the adjustment-based effective
|
||
// counts, causing moves to pile onto one server or oscillate (A→B, B→A).
|
||
task, destServerID := createBalanceTask(diskType, selectedVolume, clusterInfo, minServer, serverVolumeCounts)
|
||
if task == nil {
|
||
glog.V(1).Infof("BALANCE [%s]: Cannot plan task for volume %d on server %s, trying next volume", diskType, selectedVolume.VolumeID, maxServer)
|
||
continue
|
||
}
|
||
|
||
results = append(results, task)
|
||
|
||
// Adjust effective counts for the next iteration.
|
||
adjustments[maxServer]--
|
||
if destServerID != "" {
|
||
adjustments[destServerID]++
|
||
// If the destination server wasn't in serverVolumeCounts (e.g., a
|
||
// server with 0 volumes not seeded from topology), add it so
|
||
// subsequent iterations include it in effective/average/min/max.
|
||
if _, exists := serverVolumeCounts[destServerID]; !exists {
|
||
serverVolumeCounts[destServerID] = 0
|
||
sortedServers = append(sortedServers, destServerID)
|
||
sort.Strings(sortedServers)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Truncated only if we hit maxResults and detection didn't naturally finish
|
||
truncated := len(results) >= maxResults && !balanced
|
||
return results, truncated
|
||
}
|
||
|
||
// createBalanceTask creates a single balance task for the selected volume.
|
||
// targetServer is the server ID chosen by the detection loop's greedy algorithm.
|
||
// Returns (nil, "") if destination planning fails.
|
||
// On success, returns the task result and the canonical destination server ID.
|
||
// allowedServers is the set of servers that passed DC/rack/node filtering in
|
||
// the detection loop. When non-empty, the fallback destination planner is
|
||
// checked against this set so that filter scope cannot leak.
|
||
func createBalanceTask(diskType string, selectedVolume *types.VolumeHealthMetrics, clusterInfo *types.ClusterInfo, targetServer string, allowedServers map[string]int) (*types.TaskDetectionResult, string) {
|
||
taskID := fmt.Sprintf("balance_vol_%d_%d", selectedVolume.VolumeID, time.Now().UnixNano())
|
||
|
||
task := &types.TaskDetectionResult{
|
||
TaskID: taskID,
|
||
TaskType: types.TaskTypeBalance,
|
||
VolumeID: selectedVolume.VolumeID,
|
||
Server: selectedVolume.Server,
|
||
Collection: selectedVolume.Collection,
|
||
Priority: types.TaskPriorityNormal,
|
||
Reason: fmt.Sprintf("Cluster imbalance detected for %s disk type",
|
||
diskType),
|
||
ScheduleAt: time.Now(),
|
||
}
|
||
|
||
// Plan destination if ActiveTopology is available
|
||
if clusterInfo.ActiveTopology == nil {
|
||
glog.Warningf("No ActiveTopology available for destination planning in balance detection")
|
||
return nil, ""
|
||
}
|
||
|
||
// Resolve the target server chosen by the detection loop's effective counts.
|
||
// This keeps destination selection in sync with the greedy algorithm rather
|
||
// than relying on topology LoadCount which can diverge across iterations.
|
||
destinationPlan, err := resolveBalanceDestination(clusterInfo.ActiveTopology, selectedVolume, targetServer)
|
||
if err != nil {
|
||
// Fall back to score-based planning if the preferred target can't be resolved
|
||
glog.V(1).Infof("BALANCE [%s]: Cannot resolve target %s for volume %d, falling back to score-based planning: %v",
|
||
diskType, targetServer, selectedVolume.VolumeID, err)
|
||
destinationPlan, err = planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
|
||
if err != nil {
|
||
glog.Warningf("Failed to plan balance destination for volume %d: %v", selectedVolume.VolumeID, err)
|
||
return nil, ""
|
||
}
|
||
}
|
||
|
||
// Verify the destination is within the filtered scope. When DC/rack/node
|
||
// filters are active, allowedServers contains only the servers that passed
|
||
// filtering. The fallback planner queries the full topology, so this check
|
||
// prevents out-of-scope targets from leaking through.
|
||
if len(allowedServers) > 0 {
|
||
if _, ok := allowedServers[destinationPlan.TargetNode]; !ok {
|
||
glog.V(1).Infof("BALANCE [%s]: Planned destination %s for volume %d is outside filtered scope, skipping",
|
||
diskType, destinationPlan.TargetNode, selectedVolume.VolumeID)
|
||
return nil, ""
|
||
}
|
||
}
|
||
|
||
// Validate move against replica placement policy
|
||
if selectedVolume.ExpectedReplicas > 0 && selectedVolume.ExpectedReplicas <= 255 && clusterInfo.VolumeReplicaMap != nil {
|
||
rpBytes, rpErr := super_block.NewReplicaPlacementFromByte(byte(selectedVolume.ExpectedReplicas))
|
||
if rpErr == nil && rpBytes.HasReplication() {
|
||
replicas := clusterInfo.VolumeReplicaMap[selectedVolume.VolumeID]
|
||
if len(replicas) == 0 {
|
||
glog.V(1).Infof("BALANCE [%s]: No replica locations found for volume %d, skipping placement validation",
|
||
diskType, selectedVolume.VolumeID)
|
||
} else {
|
||
validateMove := func(plan *topology.DestinationPlan) bool {
|
||
if plan == nil {
|
||
return false
|
||
}
|
||
target := types.ReplicaLocation{
|
||
DataCenter: plan.TargetDC,
|
||
Rack: plan.TargetRack,
|
||
NodeID: plan.TargetNode,
|
||
}
|
||
return IsGoodMove(rpBytes, replicas, selectedVolume.Server, target)
|
||
}
|
||
|
||
if !validateMove(destinationPlan) {
|
||
glog.V(1).Infof("BALANCE [%s]: Destination %s violates replica placement for volume %d (rp=%03d), falling back",
|
||
diskType, destinationPlan.TargetNode, selectedVolume.VolumeID, selectedVolume.ExpectedReplicas)
|
||
// Fall back to score-based planning
|
||
destinationPlan, err = planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
|
||
if err != nil {
|
||
glog.Warningf("BALANCE [%s]: Failed to plan fallback destination for volume %d: %v", diskType, selectedVolume.VolumeID, err)
|
||
return nil, ""
|
||
}
|
||
if !validateMove(destinationPlan) {
|
||
glog.V(1).Infof("BALANCE [%s]: Fallback destination %s also violates replica placement for volume %d",
|
||
diskType, destinationPlan.TargetNode, selectedVolume.VolumeID)
|
||
return nil, ""
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Find the actual disk containing the volume on the source server
|
||
sourceDisk, found := base.FindVolumeDisk(clusterInfo.ActiveTopology, selectedVolume.VolumeID, selectedVolume.Collection, selectedVolume.Server)
|
||
if !found {
|
||
glog.Warningf("BALANCE [%s]: Could not find volume %d (collection: %s) on source server %s - unable to create balance task",
|
||
diskType, selectedVolume.VolumeID, selectedVolume.Collection, selectedVolume.Server)
|
||
return nil, ""
|
||
}
|
||
|
||
// Update reason with full details now that we have destination info
|
||
task.Reason = fmt.Sprintf("Cluster imbalance detected for %s: move volume %d from %s to %s",
|
||
diskType, selectedVolume.VolumeID, selectedVolume.Server, destinationPlan.TargetNode)
|
||
|
||
// Create typed parameters with unified source and target information
|
||
task.TypedParams = &worker_pb.TaskParams{
|
||
TaskId: taskID,
|
||
VolumeId: selectedVolume.VolumeID,
|
||
Collection: selectedVolume.Collection,
|
||
VolumeSize: selectedVolume.Size,
|
||
|
||
Sources: []*worker_pb.TaskSource{
|
||
{
|
||
Node: selectedVolume.ServerAddress,
|
||
DiskId: sourceDisk,
|
||
VolumeId: selectedVolume.VolumeID,
|
||
EstimatedSize: selectedVolume.Size,
|
||
DataCenter: selectedVolume.DataCenter,
|
||
Rack: selectedVolume.Rack,
|
||
},
|
||
},
|
||
Targets: []*worker_pb.TaskTarget{
|
||
{
|
||
Node: destinationPlan.TargetAddress,
|
||
DiskId: destinationPlan.TargetDisk,
|
||
VolumeId: selectedVolume.VolumeID,
|
||
EstimatedSize: destinationPlan.ExpectedSize,
|
||
DataCenter: destinationPlan.TargetDC,
|
||
Rack: destinationPlan.TargetRack,
|
||
},
|
||
},
|
||
|
||
TaskParams: &worker_pb.TaskParams_BalanceParams{
|
||
BalanceParams: &worker_pb.BalanceTaskParams{
|
||
ForceMove: false,
|
||
TimeoutSeconds: 600, // 10 minutes default
|
||
},
|
||
},
|
||
}
|
||
|
||
glog.V(1).Infof("Planned balance destination for volume %d: %s -> %s",
|
||
selectedVolume.VolumeID, selectedVolume.Server, destinationPlan.TargetNode)
|
||
|
||
// Add pending balance task to ActiveTopology for capacity management
|
||
targetDisk := destinationPlan.TargetDisk
|
||
|
||
err = clusterInfo.ActiveTopology.AddPendingTask(topology.TaskSpec{
|
||
TaskID: taskID,
|
||
TaskType: topology.TaskTypeBalance,
|
||
VolumeID: selectedVolume.VolumeID,
|
||
VolumeSize: int64(selectedVolume.Size),
|
||
Sources: []topology.TaskSourceSpec{
|
||
{ServerID: selectedVolume.Server, DiskID: sourceDisk},
|
||
},
|
||
Destinations: []topology.TaskDestinationSpec{
|
||
{ServerID: destinationPlan.TargetNode, DiskID: targetDisk},
|
||
},
|
||
})
|
||
if err != nil {
|
||
glog.Warningf("BALANCE [%s]: Failed to add pending task for volume %d: %v", diskType, selectedVolume.VolumeID, err)
|
||
return nil, ""
|
||
}
|
||
|
||
glog.V(2).Infof("Added pending balance task %s to ActiveTopology for volume %d: %s:%d -> %s:%d",
|
||
taskID, selectedVolume.VolumeID, selectedVolume.Server, sourceDisk, destinationPlan.TargetNode, targetDisk)
|
||
|
||
return task, destinationPlan.TargetNode
|
||
}
|
||
|
||
// resolveBalanceDestination resolves the destination for a balance operation
|
||
// when the target server is already known (chosen by the detection loop's
|
||
// effective volume counts). It finds the appropriate disk and address for the
|
||
// target server in the topology.
|
||
func resolveBalanceDestination(activeTopology *topology.ActiveTopology, selectedVolume *types.VolumeHealthMetrics, targetServer string) (*topology.DestinationPlan, error) {
|
||
topologyInfo := activeTopology.GetTopologyInfo()
|
||
if topologyInfo == nil {
|
||
return nil, fmt.Errorf("no topology info available")
|
||
}
|
||
|
||
// Find the target node in the topology and get its disk info
|
||
for _, dc := range topologyInfo.DataCenterInfos {
|
||
for _, rack := range dc.RackInfos {
|
||
for _, node := range rack.DataNodeInfos {
|
||
if node.Id != targetServer {
|
||
continue
|
||
}
|
||
// Find an available disk matching the volume's disk type
|
||
for diskTypeName, diskInfo := range node.DiskInfos {
|
||
if diskTypeName != selectedVolume.DiskType {
|
||
continue
|
||
}
|
||
if diskInfo.MaxVolumeCount > 0 && diskInfo.VolumeCount >= diskInfo.MaxVolumeCount {
|
||
continue // disk is full
|
||
}
|
||
targetAddress, err := util.ResolveServerAddress(node.Id, activeTopology)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to resolve address for target server %s: %v", node.Id, err)
|
||
}
|
||
return &topology.DestinationPlan{
|
||
TargetNode: node.Id,
|
||
TargetAddress: targetAddress,
|
||
TargetDisk: diskInfo.DiskId,
|
||
TargetRack: rack.Id,
|
||
TargetDC: dc.Id,
|
||
ExpectedSize: selectedVolume.Size,
|
||
}, nil
|
||
}
|
||
return nil, fmt.Errorf("target server %s has no available disk of type %s", targetServer, selectedVolume.DiskType)
|
||
}
|
||
}
|
||
}
|
||
return nil, fmt.Errorf("target server %s not found in topology", targetServer)
|
||
}
|
||
|
||
// planBalanceDestination plans the destination for a balance operation using
|
||
// score-based selection. Used as a fallback when the preferred target cannot
|
||
// be resolved, and for single-move scenarios outside the detection loop.
|
||
func planBalanceDestination(activeTopology *topology.ActiveTopology, selectedVolume *types.VolumeHealthMetrics) (*topology.DestinationPlan, error) {
|
||
// Get source node information from topology
|
||
var sourceRack, sourceDC string
|
||
|
||
// Extract rack and DC from topology info
|
||
topologyInfo := activeTopology.GetTopologyInfo()
|
||
if topologyInfo != nil {
|
||
for _, dc := range topologyInfo.DataCenterInfos {
|
||
for _, rack := range dc.RackInfos {
|
||
for _, dataNodeInfo := range rack.DataNodeInfos {
|
||
if dataNodeInfo.Id == selectedVolume.Server {
|
||
sourceDC = dc.Id
|
||
sourceRack = rack.Id
|
||
break
|
||
}
|
||
}
|
||
if sourceRack != "" {
|
||
break
|
||
}
|
||
}
|
||
if sourceDC != "" {
|
||
break
|
||
}
|
||
}
|
||
}
|
||
|
||
// Get available disks, excluding the source node
|
||
availableDisks := activeTopology.GetAvailableDisks(topology.TaskTypeBalance, selectedVolume.Server)
|
||
if len(availableDisks) == 0 {
|
||
return nil, fmt.Errorf("no available disks for balance operation")
|
||
}
|
||
|
||
// Sort available disks by NodeID then DiskID for deterministic tie-breaking
|
||
sort.Slice(availableDisks, func(i, j int) bool {
|
||
if availableDisks[i].NodeID != availableDisks[j].NodeID {
|
||
return availableDisks[i].NodeID < availableDisks[j].NodeID
|
||
}
|
||
return availableDisks[i].DiskID < availableDisks[j].DiskID
|
||
})
|
||
|
||
// Find the best destination disk based on balance criteria
|
||
var bestDisk *topology.DiskInfo
|
||
bestScore := math.Inf(-1)
|
||
|
||
for _, disk := range availableDisks {
|
||
// Ensure disk type matches
|
||
if disk.DiskType != selectedVolume.DiskType {
|
||
continue
|
||
}
|
||
|
||
score := calculateBalanceScore(disk, sourceRack, sourceDC, selectedVolume.Size)
|
||
if score > bestScore {
|
||
bestScore = score
|
||
bestDisk = disk
|
||
}
|
||
}
|
||
|
||
if bestDisk == nil {
|
||
return nil, fmt.Errorf("no suitable destination found for balance operation")
|
||
}
|
||
|
||
// Get the target server address
|
||
targetAddress, err := util.ResolveServerAddress(bestDisk.NodeID, activeTopology)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to resolve address for target server %s: %v", bestDisk.NodeID, err)
|
||
}
|
||
|
||
return &topology.DestinationPlan{
|
||
TargetNode: bestDisk.NodeID,
|
||
TargetAddress: targetAddress,
|
||
TargetDisk: bestDisk.DiskID,
|
||
TargetRack: bestDisk.Rack,
|
||
TargetDC: bestDisk.DataCenter,
|
||
ExpectedSize: selectedVolume.Size,
|
||
PlacementScore: bestScore,
|
||
}, nil
|
||
}
|
||
|
||
// calculateBalanceScore calculates placement score for balance operations.
|
||
// LoadCount reflects pending+assigned tasks on the disk, so we factor it into
|
||
// the utilization estimate to avoid stacking multiple moves onto the same target.
|
||
func calculateBalanceScore(disk *topology.DiskInfo, sourceRack, sourceDC string, volumeSize uint64) float64 {
|
||
if disk.DiskInfo == nil {
|
||
return 0.0
|
||
}
|
||
|
||
score := 0.0
|
||
|
||
// Prefer disks with lower effective volume count (current + pending moves).
|
||
// LoadCount is included so that disks already targeted by planned moves
|
||
// appear more utilized, naturally spreading work across targets.
|
||
if disk.DiskInfo.MaxVolumeCount > 0 {
|
||
effectiveVolumeCount := float64(disk.DiskInfo.VolumeCount) + float64(disk.LoadCount)
|
||
utilization := effectiveVolumeCount / float64(disk.DiskInfo.MaxVolumeCount)
|
||
score += (1.0 - utilization) * 50.0 // Up to 50 points for low utilization
|
||
}
|
||
|
||
// Prefer different racks for better distribution
|
||
if disk.Rack != sourceRack {
|
||
score += 30.0
|
||
}
|
||
|
||
// Prefer different data centers for better distribution
|
||
if disk.DataCenter != sourceDC {
|
||
score += 20.0
|
||
}
|
||
|
||
return score
|
||
}
|
||
|
||
// parseCSVSet splits a comma-separated string into a set of trimmed, non-empty values.
|