feat(balance): replica placement validation for volume moves (#8622)

* feat(balance): add replica placement validation for volume moves

When the volume balance detection proposes moving a volume, validate
that the move does not violate the volume's replication policy (e.g.,
ReplicaPlacement=010 requires replicas on different racks). If the
preferred destination violates the policy, fall back to score-based
planning; if that also violates, skip the volume entirely.

- Add ReplicaLocation type and VolumeReplicaMap to ClusterInfo
- Build replica map from all volumes before collection filtering
- Port placement validation logic from command_volume_fix_replication.go
- Thread replica map through collectVolumeMetrics call chain
- Add IsGoodMove check in createBalanceTask before destination use

* address PR review: extract validation closure, add defensive checks

- Extract validateMove closure to eliminate duplicated ReplicaLocation
  construction and IsGoodMove calls
- Add defensive check for empty replica map entries (len(replicas) == 0)
- Add bounds check for int-to-byte cast on ExpectedReplicas (0-255)

* address nitpick: rp test helper accepts *testing.T and fails on error

Prevents silent failures from typos in replica placement codes.

* address review: add composite replica placement tests (011, 110)

Test multi-constraint placement policies where both rack and DC
rules must be satisfied simultaneously.

* address review: use struct keys instead of string concatenation

Replace string-concatenated map keys with typed rackKey/nodeKey
structs to eliminate allocations and avoid ambiguity if IDs
contain spaces.

* address review: simplify bounds check, log fallback error, guard source

- Remove unreachable ExpectedReplicas < 0 branch (outer condition
  already guarantees > 0), fold bounds check into single condition
- Log error from planBalanceDestination in replica validation fallback
- Return false from IsGoodMove when sourceNodeID not found in
  existing replicas (inconsistent cluster state)

* address review: use slices.Contains instead of hand-rolled helpers

Replace isAmongDC and isAmongRack with slices.Contains from the
standard library, reducing boilerplate.
This commit is contained in:
Chris Lu
2026-03-13 17:39:25 -07:00
committed by GitHub
parent 47ddf05d95
commit 8056b702ba
9 changed files with 364 additions and 30 deletions

View File

@@ -10,6 +10,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/admin/topology"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/util"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
@@ -389,6 +390,46 @@ func createBalanceTask(diskType string, selectedVolume *types.VolumeHealthMetric
}
}
// Validate move against replica placement policy
if selectedVolume.ExpectedReplicas > 0 && selectedVolume.ExpectedReplicas <= 255 && clusterInfo.VolumeReplicaMap != nil {
rpBytes, rpErr := super_block.NewReplicaPlacementFromByte(byte(selectedVolume.ExpectedReplicas))
if rpErr == nil && rpBytes.HasReplication() {
replicas := clusterInfo.VolumeReplicaMap[selectedVolume.VolumeID]
if len(replicas) == 0 {
glog.V(1).Infof("BALANCE [%s]: No replica locations found for volume %d, skipping placement validation",
diskType, selectedVolume.VolumeID)
} else {
validateMove := func(plan *topology.DestinationPlan) bool {
if plan == nil {
return false
}
target := types.ReplicaLocation{
DataCenter: plan.TargetDC,
Rack: plan.TargetRack,
NodeID: plan.TargetNode,
}
return IsGoodMove(rpBytes, replicas, selectedVolume.Server, target)
}
if !validateMove(destinationPlan) {
glog.V(1).Infof("BALANCE [%s]: Destination %s violates replica placement for volume %d (rp=%03d), falling back",
diskType, destinationPlan.TargetNode, selectedVolume.VolumeID, selectedVolume.ExpectedReplicas)
// Fall back to score-based planning
destinationPlan, err = planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
if err != nil {
glog.Warningf("BALANCE [%s]: Failed to plan fallback destination for volume %d: %v", diskType, selectedVolume.VolumeID, err)
return nil, ""
}
if !validateMove(destinationPlan) {
glog.V(1).Infof("BALANCE [%s]: Fallback destination %s also violates replica placement for volume %d",
diskType, destinationPlan.TargetNode, selectedVolume.VolumeID)
return nil, ""
}
}
}
}
}
// Find the actual disk containing the volume on the source server
sourceDisk, found := base.FindVolumeDisk(clusterInfo.ActiveTopology, selectedVolume.VolumeID, selectedVolume.Collection, selectedVolume.Server)
if !found {

View File

@@ -0,0 +1,146 @@
package balance
import (
"slices"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// rackKey uniquely identifies a rack within a data center.
// Using a struct key (rather than a concatenated string) avoids
// allocations and ambiguity when IDs contain separator characters.
type rackKey struct {
	DataCenter string
	Rack       string
}

// nodeKey uniquely identifies a node within a rack.
type nodeKey struct {
	DataCenter string
	Rack       string
	NodeID     string
}
// IsGoodMove reports whether moving a volume replica from sourceNodeID to
// target would still satisfy the volume's replica placement policy rp,
// given the current set of replica locations.
func IsGoodMove(rp *super_block.ReplicaPlacement, existingReplicas []types.ReplicaLocation, sourceNodeID string, target types.ReplicaLocation) bool {
	// Without a replication constraint, any destination is acceptable.
	if rp == nil || !rp.HasReplication() {
		return true
	}

	// Simulate the replica set after the move by dropping the source copy;
	// the target is evaluated separately below.
	remaining := make([]types.ReplicaLocation, 0, len(existingReplicas))
	foundSource := false
	for _, replica := range existingReplicas {
		if replica.NodeID == sourceNodeID {
			foundSource = true
			continue
		}
		remaining = append(remaining, replica)
	}

	// The source must be a known replica; if it is not, the cluster state
	// is inconsistent and the move is treated as unsafe.
	if !foundSource {
		return false
	}

	return satisfyReplicaPlacement(rp, remaining, target)
}
// satisfyReplicaPlacement checks whether placing a replica at target
// is consistent with the replication policy, given the existing replicas.
// Ported from weed/shell/command_volume_fix_replication.go
//
// The check is hierarchical: data center first, then rack within the
// target's (primary) data center, then the same-rack replica count.
// A placement is accepted as soon as it fills a still-open
// different-DC / different-rack / same-rack slot of the policy.
func satisfyReplicaPlacement(rp *super_block.ReplicaPlacement, replicas []types.ReplicaLocation, target types.ReplicaLocation) bool {
	existingDCs, _, existingNodes := countReplicas(replicas)
	targetNK := nodeKey{DataCenter: target.DataCenter, Rack: target.Rack, NodeID: target.NodeID}
	if _, found := existingNodes[targetNK]; found {
		// avoid duplicated volume on the same data node
		return false
	}
	// primaryDCs = the data center(s) currently holding the most replicas.
	primaryDCs, _ := findTopDCKeys(existingDCs)
	// ensure data center count is within limit
	if _, found := existingDCs[target.DataCenter]; !found {
		// different from existing dcs: allowed only while a
		// "different data center" slot of the policy is still open
		if len(existingDCs) < rp.DiffDataCenterCount+1 {
			return true
		}
		return false
	}
	// now same as one of existing data centers; only the primary DC(s)
	// may take additional replicas
	if !slices.Contains(primaryDCs, target.DataCenter) {
		return false
	}
	// now on a primary dc - check racks within this DC only
	primaryDcRacks := make(map[rackKey]int)
	for _, r := range replicas {
		if r.DataCenter != target.DataCenter {
			continue
		}
		primaryDcRacks[rackKey{DataCenter: r.DataCenter, Rack: r.Rack}]++
	}
	targetRK := rackKey{DataCenter: target.DataCenter, Rack: target.Rack}
	primaryRacks, _ := findTopRackKeys(primaryDcRacks)
	sameRackCount := primaryDcRacks[targetRK]
	if _, found := primaryDcRacks[targetRK]; !found {
		// different from existing racks: allowed only while a
		// "different rack" slot is still open
		if len(primaryDcRacks) < rp.DiffRackCount+1 {
			return true
		}
		return false
	}
	// same as one of existing racks; only the primary rack(s) may grow
	if !slices.Contains(primaryRacks, targetRK) {
		return false
	}
	// on primary rack - accept while the same-rack count is under the limit
	if sameRackCount < rp.SameRackCount+1 {
		return true
	}
	return false
}
// countReplicas tallies how many replicas live in each data center,
// each rack, and on each data node.
func countReplicas(replicas []types.ReplicaLocation) (dcCounts map[string]int, rackCounts map[rackKey]int, nodeCounts map[nodeKey]int) {
	dcCounts = make(map[string]int, len(replicas))
	rackCounts = make(map[rackKey]int, len(replicas))
	nodeCounts = make(map[nodeKey]int, len(replicas))
	for _, rep := range replicas {
		rk := rackKey{DataCenter: rep.DataCenter, Rack: rep.Rack}
		dcCounts[rep.DataCenter]++
		rackCounts[rk]++
		nodeCounts[nodeKey{DataCenter: rep.DataCenter, Rack: rep.Rack, NodeID: rep.NodeID}]++
	}
	return
}
// findTopDCKeys returns every data center whose replica count ties for
// the highest value in m, together with that highest count.
// An empty map yields (nil, 0).
func findTopDCKeys(m map[string]int) (topKeys []string, top int) {
	for dc, count := range m {
		switch {
		case count > top:
			top = count
			topKeys = append(topKeys[:0], dc)
		case count == top:
			topKeys = append(topKeys, dc)
		}
	}
	return
}
// findTopRackKeys returns every rack whose replica count ties for the
// highest value in m, together with that highest count.
// An empty map yields (nil, 0).
func findTopRackKeys(m map[rackKey]int) (topKeys []rackKey, top int) {
	for rk, count := range m {
		switch {
		case count > top:
			top = count
			topKeys = append(topKeys[:0], rk)
		case count == top:
			topKeys = append(topKeys, rk)
		}
	}
	return
}

View File

@@ -0,0 +1,127 @@
package balance
import (
"testing"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
// rp parses a replica placement code (e.g. "010") for test fixtures and
// fails the test immediately on an invalid code, preventing silent
// failures caused by typos in placement strings.
func rp(t *testing.T, code string) *super_block.ReplicaPlacement {
	t.Helper()
	r, err := super_block.NewReplicaPlacementFromString(code)
	if err != nil {
		t.Fatalf("invalid replica placement code %q: %v", code, err)
	}
	return r
}
// loc builds a ReplicaLocation literal, keeping test fixtures concise.
func loc(dc, rack, node string) types.ReplicaLocation {
	return types.ReplicaLocation{DataCenter: dc, Rack: rack, NodeID: node}
}
// TestIsGoodMove_NoReplication verifies that a volume with no replication
// constraint (code 000) may be moved anywhere.
func TestIsGoodMove_NoReplication(t *testing.T) {
	// 000 = no replication. Any move is fine.
	if !IsGoodMove(rp(t, "000"), []types.ReplicaLocation{loc("dc1", "r1", "n1")}, "n1", loc("dc1", "r1", "n2")) {
		t.Error("000: any move should be allowed")
	}
}
// TestIsGoodMove_001_SameRack covers code 001 (one extra replica on the
// same rack): both copies must remain on a single rack.
func TestIsGoodMove_001_SameRack(t *testing.T) {
	// 001 = 1 replica on same rack (2 total on same rack)
	existing := []types.ReplicaLocation{
		loc("dc1", "r1", "n1"),
		loc("dc1", "r1", "n2"),
	}
	// Move n1 -> n3 on same rack: good
	if !IsGoodMove(rp(t, "001"), existing, "n1", loc("dc1", "r1", "n3")) {
		t.Error("001: move to same rack should be allowed")
	}
	// Move n1 -> n3 on different rack: bad (would leave only 1 on r1, need 2)
	if IsGoodMove(rp(t, "001"), existing, "n1", loc("dc1", "r2", "n3")) {
		t.Error("001: move to different rack should not be allowed when it breaks same-rack count")
	}
}
// TestIsGoodMove_010_DiffRack covers code 010 (one replica on a different
// rack): the two copies must stay on two distinct racks.
func TestIsGoodMove_010_DiffRack(t *testing.T) {
	// 010 = 1 replica on different rack (2 racks total)
	existing := []types.ReplicaLocation{
		loc("dc1", "r1", "n1"),
		loc("dc1", "r2", "n2"),
	}
	// Move n1 -> n3 on r2: bad (both replicas on same rack)
	if IsGoodMove(rp(t, "010"), existing, "n1", loc("dc1", "r2", "n3")) {
		t.Error("010: move to same rack as other replica should not be allowed")
	}
	// Move n1 -> n3 on r3: good (still 2 different racks)
	if !IsGoodMove(rp(t, "010"), existing, "n1", loc("dc1", "r3", "n3")) {
		t.Error("010: move to different rack should be allowed")
	}
}
// TestIsGoodMove_100_DiffDC covers code 100 (one replica in a different
// data center): the two copies must stay in two distinct DCs.
func TestIsGoodMove_100_DiffDC(t *testing.T) {
	// 100 = 1 replica in different DC
	existing := []types.ReplicaLocation{
		loc("dc1", "r1", "n1"),
		loc("dc2", "r1", "n2"),
	}
	// Move n1 -> n3 in dc2: bad (both in same DC)
	if IsGoodMove(rp(t, "100"), existing, "n1", loc("dc2", "r1", "n3")) {
		t.Error("100: move to same DC as other replica should not be allowed")
	}
	// Move n1 -> n3 in dc3: good (different DCs)
	if !IsGoodMove(rp(t, "100"), existing, "n1", loc("dc3", "r1", "n3")) {
		t.Error("100: move to different DC should be allowed")
	}
}
// TestIsGoodMove_SameNode verifies the unconditional rule that two
// replicas of the same volume may never share a data node.
func TestIsGoodMove_SameNode(t *testing.T) {
	// Moving to the same node as an existing replica should always be rejected
	existing := []types.ReplicaLocation{
		loc("dc1", "r1", "n1"),
		loc("dc1", "r2", "n2"),
	}
	if IsGoodMove(rp(t, "010"), existing, "n1", loc("dc1", "r2", "n2")) {
		t.Error("should reject move to same node as existing replica")
	}
}
// TestIsGoodMove_011_Composite covers the composite code 011, where the
// same-rack and different-rack constraints must hold simultaneously.
func TestIsGoodMove_011_Composite(t *testing.T) {
	// 011 = 1 same-rack + 1 different-rack (3 replicas: 2 on same rack, 1 on different)
	existing := []types.ReplicaLocation{
		loc("dc1", "r1", "n1"),
		loc("dc1", "r1", "n2"),
		loc("dc1", "r2", "n3"),
	}
	// Move n1 -> n4 on r1: good (maintains 2 on r1, 1 on r2)
	if !IsGoodMove(rp(t, "011"), existing, "n1", loc("dc1", "r1", "n4")) {
		t.Error("011: move within same rack should be allowed")
	}
	// Move n3 -> n4 on r1: bad (would have 3 on r1, 0 on different rack)
	if IsGoodMove(rp(t, "011"), existing, "n3", loc("dc1", "r1", "n4")) {
		t.Error("011: move that eliminates different-rack replica should not be allowed")
	}
}
// TestIsGoodMove_110_Composite covers the composite code 110, where the
// different-rack and different-DC constraints must hold simultaneously.
func TestIsGoodMove_110_Composite(t *testing.T) {
	// 110 = 1 different-rack + 1 different-DC (3 replicas across 2 DCs and 2 racks)
	existing := []types.ReplicaLocation{
		loc("dc1", "r1", "n1"),
		loc("dc1", "r2", "n2"),
		loc("dc2", "r1", "n3"),
	}
	// Move n1 -> n4 in dc1/r3: good (dc1 still has r2+r3, dc2 has r1)
	if !IsGoodMove(rp(t, "110"), existing, "n1", loc("dc1", "r3", "n4")) {
		t.Error("110: move to new rack in same DC should be allowed")
	}
	// Move n3 -> n4 in dc1/r1: bad (would lose the different-DC replica)
	if IsGoodMove(rp(t, "110"), existing, "n3", loc("dc1", "r1", "n4")) {
		t.Error("110: move that eliminates different-DC replica should not be allowed")
	}
}
// TestIsGoodMove_NilReplicaPlacement verifies that a nil policy is treated
// as "no constraint" rather than panicking or rejecting the move.
func TestIsGoodMove_NilReplicaPlacement(t *testing.T) {
	if !IsGoodMove(nil, []types.ReplicaLocation{loc("dc1", "r1", "n1")}, "n1", loc("dc1", "r1", "n2")) {
		t.Error("nil replica placement should allow any move")
	}
}

View File

@@ -6,13 +6,21 @@ import (
"github.com/seaweedfs/seaweedfs/weed/admin/topology"
)
// ReplicaLocation identifies where a volume replica lives, following the
// data center / rack / node hierarchy used by replica placement
// validation during balance task detection.
type ReplicaLocation struct {
	DataCenter string
	Rack       string
	NodeID     string
}
// ClusterInfo contains cluster information for task detection
type ClusterInfo struct {
Servers []*VolumeServerInfo
TotalVolumes int
TotalServers int
LastUpdated time.Time
ActiveTopology *topology.ActiveTopology // Added for destination planning in detection
Servers []*VolumeServerInfo
TotalVolumes int
TotalServers int
LastUpdated time.Time
ActiveTopology *topology.ActiveTopology // Added for destination planning in detection
VolumeReplicaMap map[uint32][]ReplicaLocation
}
// VolumeHealthMetrics contains health information about a volume (simplified)