feat(balance): replica placement validation for volume moves (#8622)
* feat(balance): add replica placement validation for volume moves

  When balance detection proposes moving a volume, validate that the move does
  not violate the volume's replication policy (e.g., ReplicaPlacement=010
  requires replicas on different racks). If the preferred destination violates
  the policy, fall back to score-based planning; if that also violates it,
  skip the volume entirely.

  - Add ReplicaLocation type and VolumeReplicaMap to ClusterInfo
  - Build the replica map from all volumes before collection filtering
  - Port placement validation logic from command_volume_fix_replication.go
    (sketched below)
  - Thread the replica map through the collectVolumeMetrics call chain
  - Add an IsGoodMove check in createBalanceTask before the destination is used

* address PR review: extract validation closure, add defensive checks

  - Extract a validateMove closure to eliminate duplicated ReplicaLocation
    construction and IsGoodMove calls
  - Add a defensive check for empty replica map entries (len(replicas) == 0)
  - Add a bounds check for the int-to-byte cast on ExpectedReplicas (0-255)

* address nitpick: rp test helper accepts *testing.T and fails on error

  Prevents silent failures from typos in replica placement codes (see the
  helper sketch below).

* address review: add composite replica placement tests (011, 110)

  Test multi-constraint placement policies where both rack and DC rules must
  be satisfied simultaneously.

* address review: use struct keys instead of string concatenation

  Replace string-concatenated map keys with typed rackKey/nodeKey structs to
  eliminate allocations and avoid ambiguity if IDs contain spaces (sketched
  below).

* address review: simplify bounds check, log fallback error, guard source

  - Remove the unreachable ExpectedReplicas < 0 branch (the outer condition
    already guarantees > 0) and fold the bounds check into a single condition
  - Log the error from planBalanceDestination in the replica validation
    fallback
  - Return false from IsGoodMove when sourceNodeID is not found among the
    existing replicas (inconsistent cluster state)

* address review: use slices.Contains instead of hand-rolled helpers

  Replace isAmongDC and isAmongRack with slices.Contains from the standard
  library, reducing boilerplate.
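As context for the ported check, here is a minimal sketch of what IsGoodMove could look like. The ReplicaLocation fields, the signature shape, and the source-not-found guard come from this commit's diff and message; the counting logic is a simplified assumption, and the logic actually ported from command_volume_fix_replication.go is more involved (it also verifies same-rack counts per rack).

package balance // hypothetical package name for this sketch

import (
	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
)

// ReplicaLocation mirrors the type this commit adds to ClusterInfo.
type ReplicaLocation struct {
	DataCenter string
	Rack       string
	NodeID     string
}

// IsGoodMove simulates the replica set after moving the replica off
// sourceNodeID onto target, then checks the placement constraints.
func IsGoodMove(rp *super_block.ReplicaPlacement, replicas []ReplicaLocation, sourceNodeID string, target ReplicaLocation) bool {
	after := make([]ReplicaLocation, 0, len(replicas))
	sourceFound := false
	for _, r := range replicas {
		if r.NodeID == sourceNodeID {
			sourceFound = true // this replica leaves the source node
			continue
		}
		after = append(after, r)
	}
	if !sourceFound {
		// Source not among the known replicas: inconsistent cluster
		// state, so refuse the move (per the review fix above).
		return false
	}
	after = append(after, target)

	type rackID struct{ dc, rack string } // typed key, see the struct-key sketch below
	dcSet := map[string]bool{}
	rackSet := map[rackID]bool{}
	nodeSet := map[string]bool{}
	for _, r := range after {
		dcSet[r.DataCenter] = true
		rackSet[rackID{r.DataCenter, r.Rack}] = true
		nodeSet[r.NodeID] = true
	}

	// e.g. rp=010 (DiffRackCount=1) requires two replicas on two distinct racks.
	return len(nodeSet) == len(after) && // no two replicas on one node
		len(dcSet) >= rp.DiffDataCenterCount+1 &&
		len(rackSet) >= rp.DiffDataCenterCount+rp.DiffRackCount+1
}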
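The struct-key and slices.Contains review items, sketched. Only the rackKey/nodeKey names come from the commit message; the field layout and the isAmongDC shape are assumptions.

// Typed map keys replacing string concatenation. A struct key is
// comparable and allocation-free, and stays unambiguous even when an
// ID contains the separator character.
type rackKey struct {
	dc   string
	rack string
}

type nodeKey struct {
	dc   string
	rack string
	node string
}

// before: seen[dc+" "+rack] = true                // allocates; ambiguous if IDs contain spaces
// after:  seen[rackKey{dc: dc, rack: rack}] = true

// Hand-rolled membership helpers replaced by the standard library:
//
//	func isAmongDC(dc string, dcs []string) bool { // before
//		for _, d := range dcs {
//			if d == dc {
//				return true
//			}
//		}
//		return false
//	}
//
//	slices.Contains(dcs, dc) // after; package "slices", Go 1.21+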
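A sketch of the test helper and one composite-placement assertion. The helper name and the test body are assumptions; only the behavior (accepts *testing.T, fails on parse error) and the rp codes 011 and 110 come from the commit message.

import (
	"testing"

	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
)

// mustParseRP fails the test immediately on an invalid placement code,
// so a typo like "01O" cannot silently yield a nil placement.
func mustParseRP(t *testing.T, code string) *super_block.ReplicaPlacement {
	t.Helper()
	rp, err := super_block.NewReplicaPlacementFromString(code)
	if err != nil {
		t.Fatalf("invalid replica placement %q: %v", code, err)
	}
	return rp
}

func TestCompositePlacement(t *testing.T) {
	// "011": one replica on a different rack plus one on the same rack.
	// "110": one replica in a different DC plus one on a different rack.
	for _, code := range []string{"011", "110"} {
		rp := mustParseRP(t, code)
		if got := rp.GetCopyCount(); got != 3 {
			t.Errorf("rp %s: want 3 copies, got %d", code, got)
		}
	}
}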
@@ -10,6 +10,7 @@ import (
	"github.com/seaweedfs/seaweedfs/weed/admin/topology"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/util"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
@@ -389,6 +390,46 @@ func createBalanceTask(diskType string, selectedVolume *types.VolumeHealthMetric
		}
	}

	// Validate move against replica placement policy
	if selectedVolume.ExpectedReplicas > 0 && selectedVolume.ExpectedReplicas <= 255 && clusterInfo.VolumeReplicaMap != nil {
		rpBytes, rpErr := super_block.NewReplicaPlacementFromByte(byte(selectedVolume.ExpectedReplicas))
		if rpErr == nil && rpBytes.HasReplication() {
			replicas := clusterInfo.VolumeReplicaMap[selectedVolume.VolumeID]
			if len(replicas) == 0 {
				glog.V(1).Infof("BALANCE [%s]: No replica locations found for volume %d, skipping placement validation",
					diskType, selectedVolume.VolumeID)
			} else {
				validateMove := func(plan *topology.DestinationPlan) bool {
					if plan == nil {
						return false
					}
					target := types.ReplicaLocation{
						DataCenter: plan.TargetDC,
						Rack:       plan.TargetRack,
						NodeID:     plan.TargetNode,
					}
					return IsGoodMove(rpBytes, replicas, selectedVolume.Server, target)
				}

				if !validateMove(destinationPlan) {
					glog.V(1).Infof("BALANCE [%s]: Destination %s violates replica placement for volume %d (rp=%03d), falling back",
						diskType, destinationPlan.TargetNode, selectedVolume.VolumeID, selectedVolume.ExpectedReplicas)
					// Fall back to score-based planning
					destinationPlan, err = planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
					if err != nil {
						glog.Warningf("BALANCE [%s]: Failed to plan fallback destination for volume %d: %v", diskType, selectedVolume.VolumeID, err)
						return nil, ""
					}
					if !validateMove(destinationPlan) {
						glog.V(1).Infof("BALANCE [%s]: Fallback destination %s also violates replica placement for volume %d",
							diskType, destinationPlan.TargetNode, selectedVolume.VolumeID)
						return nil, ""
					}
				}
			}
		}
	}

	// Find the actual disk containing the volume on the source server
	sourceDisk, found := base.FindVolumeDisk(clusterInfo.ActiveTopology, selectedVolume.VolumeID, selectedVolume.Collection, selectedVolume.Server)
	if !found {
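For reference, the 0-255 bounds check and the rp=%03d log format line up with how super_block appears to encode a placement byte: the byte is read as its three-digit decimal code (hundreds = different-DC count, tens = different-rack count, ones = same-rack count). A small sketch under that assumption:

package main

import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
)

func main() {
	// Byte 10 reads as "010": one replica on a different rack.
	rp, err := super_block.NewReplicaPlacementFromByte(10)
	if err != nil {
		panic(err)
	}
	fmt.Println(rp.DiffDataCenterCount, rp.DiffRackCount, rp.SameRackCount) // 0 1 0
	fmt.Println(rp.GetCopyCount())                                         // 2 copies in total
	// Values outside 0-255 cannot round-trip through a byte, hence the
	// ExpectedReplicas <= 255 guard in createBalanceTask.
}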