Enhance EC balancing to separate parity and data shards (#8038)
* Enhance EC balancing to separate parity and data shards across racks
* Rename avoidRacks to antiAffinityRacks for clarity
* Implement server-level EC separation for parity/data shards
* Optimize EC balancing: consolidate helpers and extract two-pass selection logic
* Add comprehensive edge case tests for EC balancing logic
* Apply code review feedback: rename select_(), add divide-by-zero guard, fix comment
* Remove unused parameters from doBalanceEcShardsWithinOneRack and add explicit anti-affinity check
* Add disk-level anti-affinity for data/parity shard separation
  - Modified pickBestDiskOnNode to accept shardId and dataShardCount
  - Implemented explicit anti-affinity: 1000-point penalty for placing data shards on disks with parity (and vice versa)
  - Updated all call sites including balancing and evacuation
  - For evacuation, disabled anti-affinity by passing dataShardCount=0
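The disk-level separation described in the last bullet reduces to a scoring rule: a candidate disk keeps its usual score (lower is better, weighted by its existing shard counts), and a large flat penalty is added when the shard being placed and the shards already on that disk are of opposite types. A minimal sketch of that scoring idea (the helper name scoreDisk and its parameters are illustrative only, not the actual pickBestDiskOnNode signature shown in the diff below):

package main

import "fmt"

// scoreDisk is an illustrative stand-in for the scoring inside pickBestDiskOnNode:
// lower is better, and a flat penalty discourages mixing data and parity shards
// of the same volume on one disk.
func scoreDisk(totalShards, sameVolumeShards int, placingData, diskHasData, diskHasParity bool) int {
	score := totalShards*10 + sameVolumeShards*100
	if placingData && diskHasParity {
		score += 1000 // a data shard would land next to parity shards
	} else if !placingData && diskHasData {
		score += 1000 // a parity shard would land next to data shards
	}
	return score
}

func main() {
	// Placing a data shard of a 10+4 volume: an empty disk beats a disk holding parity.
	fmt.Println(scoreDisk(0, 0, true, false, false)) // 0
	fmt.Println(scoreDisk(2, 1, true, false, true))  // 2*10 + 1*100 + 1000 = 1120
}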
@@ -809,24 +809,48 @@ func countShardsByRack(vid needle.VolumeId, locations []*EcNode, diskType types.
 	})
 }
 
-// shardsByTypePerRack counts data shards (< dataShards) and parity shards (>= dataShards) per rack
-func shardsByTypePerRack(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int) (dataPerRack, parityPerRack map[string][]erasure_coding.ShardId) {
-	dataPerRack = make(map[string][]erasure_coding.ShardId)
-	parityPerRack = make(map[string][]erasure_coding.ShardId)
+// shardsByType is a generic helper that counts data and parity shards per group
+func shardsByType(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int, keyExtractor func(*EcNode) string) (dataPerGroup, parityPerGroup map[string][]erasure_coding.ShardId) {
+	dataPerGroup = make(map[string][]erasure_coding.ShardId)
+	parityPerGroup = make(map[string][]erasure_coding.ShardId)
 	for _, ecNode := range locations {
 		si := findEcVolumeShardsInfo(ecNode, vid, diskType)
-		rackId := string(ecNode.rack)
+		groupKey := keyExtractor(ecNode)
 		for _, shardId := range si.Ids() {
 			if int(shardId) < dataShards {
-				dataPerRack[rackId] = append(dataPerRack[rackId], shardId)
+				dataPerGroup[groupKey] = append(dataPerGroup[groupKey], shardId)
 			} else {
-				parityPerRack[rackId] = append(parityPerRack[rackId], shardId)
+				parityPerGroup[groupKey] = append(parityPerGroup[groupKey], shardId)
 			}
 		}
 	}
 	return
 }
 
+// shardsByTypePerRack counts data shards (< dataShards) and parity shards (>= dataShards) per rack
+func shardsByTypePerRack(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int) (dataPerRack, parityPerRack map[string][]erasure_coding.ShardId) {
+	return shardsByType(vid, locations, diskType, dataShards, func(ecNode *EcNode) string {
+		return string(ecNode.rack)
+	})
+}
+
+// shardsByTypePerNode counts data shards and parity shards per node
+func shardsByTypePerNode(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int) (dataPerNode, parityPerNode map[string][]erasure_coding.ShardId) {
+	return shardsByType(vid, locations, diskType, dataShards, func(ecNode *EcNode) string {
+		return ecNode.info.Id
+	})
+}
+
+func countShardsByNode(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType) map[string]int {
+	return groupByCount(locations, func(ecNode *EcNode) (id string, count int) {
+		id = ecNode.info.Id
+		if si := findEcVolumeShardsInfo(ecNode, vid, diskType); si != nil {
+			count = si.Count()
+		}
+		return
+	})
+}
+
 func (ecb *ecBalancer) doBalanceEcShardsAcrossRacks(collection string, vid needle.VolumeId, locations []*EcNode) error {
 	racks := ecb.racks()
 	numRacks := len(racks)
@@ -852,20 +876,30 @@ func (ecb *ecBalancer) doBalanceEcShardsAcrossRacks(collection string, vid needl
 	rackToShardCount := countShardsByRack(vid, locations, ecb.diskType)
 
 	// First pass: Balance data shards across racks
-	if err := ecb.balanceShardTypeAcrossRacks(collection, vid, racks, rackEcNodesWithVid, dataPerRack, rackToShardCount, maxDataPerRack, "data"); err != nil {
+	if err := ecb.balanceShardTypeAcrossRacks(collection, vid, racks, rackEcNodesWithVid, dataPerRack, rackToShardCount, maxDataPerRack, "data", nil); err != nil {
 		return err
 	}
 
 	// Refresh locations after data shard moves and get parity distribution
 	locations = ecb.collectVolumeIdToEcNodes(collection)[vid]
-	_, parityPerRack := shardsByTypePerRack(vid, locations, ecb.diskType, dataShardCount)
+	dataPerRack, parityPerRack := shardsByTypePerRack(vid, locations, ecb.diskType, dataShardCount)
 	rackEcNodesWithVid = groupBy(locations, func(ecNode *EcNode) string {
 		return string(ecNode.rack)
 	})
 	rackToShardCount = countShardsByRack(vid, locations, ecb.diskType)
 
-	// Second pass: Balance parity shards across racks
-	if err := ecb.balanceShardTypeAcrossRacks(collection, vid, racks, rackEcNodesWithVid, parityPerRack, rackToShardCount, maxParityPerRack, "parity"); err != nil {
+	// Identify racks containing data shards to avoid for parity placement.
+	// We call this "antiAffinityRacks" because we want parity shards to have anti-affinity
+	// with racks that hold data shards, to ensure better fault tolerance.
+	antiAffinityRacks := make(map[string]bool)
+	for rackId, shards := range dataPerRack {
+		if len(shards) > 0 {
+			antiAffinityRacks[rackId] = true
+		}
+	}
+
+	// Second pass: Balance parity shards across racks, ignoring racks with data shards if possible
+	if err := ecb.balanceShardTypeAcrossRacks(collection, vid, racks, rackEcNodesWithVid, parityPerRack, rackToShardCount, maxParityPerRack, "parity", antiAffinityRacks); err != nil {
 		return err
 	}
 
@@ -882,6 +916,7 @@ func (ecb *ecBalancer) balanceShardTypeAcrossRacks(
 	rackToShardCount map[string]int,
 	maxPerRack int,
 	shardType string,
+	antiAffinityRacks map[string]bool,
 ) error {
 	// Find racks with too many shards of this type
 	shardsToMove := make(map[erasure_coding.ShardId]*EcNode)
@@ -908,7 +943,7 @@ func (ecb *ecBalancer) balanceShardTypeAcrossRacks(
 	// Move shards to racks that have fewer than maxPerRack of this type
 	for shardId, ecNode := range shardsToMove {
 		// Find destination rack with room for this shard type
-		destRackId, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount)
+		destRackId, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount, antiAffinityRacks)
 		if err != nil {
 			fmt.Printf("ec %s shard %d.%d at %s can not find a destination rack:\n%s\n", shardType, vid, shardId, ecNode.info.Id, err.Error())
 			continue
@@ -942,41 +977,145 @@ func (ecb *ecBalancer) balanceShardTypeAcrossRacks(
 	return nil
 }
 
+// twoPassSelector implements two-pass selection with anti-affinity
+// Pass 1: Select from candidates NOT in antiAffinity set
+// Pass 2: Fallback to any valid candidate if Pass 1 yields no results
+type twoPassSelector[T any] struct {
+	candidates         []T
+	shardsPerTarget    map[string][]erasure_coding.ShardId
+	maxPerTarget       int
+	targetToShardCount map[string]int
+	antiAffinity       map[string]bool
+
+	// Functions to extract info from candidate
+	getKey       func(T) string
+	hasFreeSlots func(T) bool
+	checkLimit   func(T) bool // replica placement or other limits
+}
+
+func (s *twoPassSelector[T]) selectCandidate() (T, error) {
+	var selected []T
+	minShards := s.maxPerTarget + 1
+
+	// Pass 1: Try candidates NOT in anti-affinity set
+	for _, candidate := range s.candidates {
+		if !s.hasFreeSlots(candidate) {
+			continue
+		}
+		key := s.getKey(candidate)
+		currentCount := len(s.shardsPerTarget[key])
+		if currentCount >= s.maxPerTarget {
+			continue
+		}
+		if !s.checkLimit(candidate) {
+			continue
+		}
+
+		// Skip anti-affinity targets in pass 1
+		if s.antiAffinity != nil && s.antiAffinity[key] {
+			continue
+		}
+
+		if currentCount < minShards {
+			selected = nil
+			minShards = currentCount
+		}
+		if currentCount == minShards {
+			selected = append(selected, candidate)
+		}
+	}
+
+	// Pass 2: Fallback if no candidates found
+	if len(selected) == 0 {
+		minShards = s.maxPerTarget + 1
+		for _, candidate := range s.candidates {
+			if !s.hasFreeSlots(candidate) {
+				continue
+			}
+			key := s.getKey(candidate)
+			currentCount := len(s.shardsPerTarget[key])
+			if currentCount >= s.maxPerTarget {
+				continue
+			}
+			if !s.checkLimit(candidate) {
+				continue
+			}
+
+			if currentCount < minShards {
+				selected = nil
+				minShards = currentCount
+			}
+			if currentCount == minShards {
+				selected = append(selected, candidate)
+			}
+		}
+	}
+
+	if len(selected) == 0 {
+		var zero T
+		return zero, errors.New("no valid candidate available")
+	}
+	return selected[rand.IntN(len(selected))], nil
+}
+
 // pickRackForShardType selects a rack that has room for more shards of a specific type
 func (ecb *ecBalancer) pickRackForShardType(
 	rackToEcNodes map[RackId]*EcRack,
 	shardsPerRack map[string][]erasure_coding.ShardId,
 	maxPerRack int,
 	rackToShardCount map[string]int,
+	antiAffinityRacks map[string]bool,
 ) (RackId, error) {
-	var candidates []RackId
-	minShards := maxPerRack + 1
-	for rackId, rack := range rackToEcNodes {
-		if rack.freeEcSlot <= 0 {
-			continue
-		}
-		currentCount := len(shardsPerRack[string(rackId)])
-		if currentCount >= maxPerRack {
-			continue
-		}
-		// For EC shards, replica placement constraint only applies when DiffRackCount > 0.
-		if ecb.replicaPlacement != nil && ecb.replicaPlacement.DiffRackCount > 0 && rackToShardCount[string(rackId)] >= ecb.replicaPlacement.DiffRackCount {
-			continue
-		}
-		if currentCount < minShards {
-			candidates = nil
-			minShards = currentCount
-		}
-		if currentCount == minShards {
-			candidates = append(candidates, rackId)
-		}
-	}
-
-	if len(candidates) == 0 {
-		return "", errors.New("no rack available for shard type balancing")
-	}
-	return candidates[rand.IntN(len(candidates))], nil
+	// Convert map to slice for iteration
+	var rackCandidates []struct {
+		id   RackId
+		rack *EcRack
+	}
+	for id, rack := range rackToEcNodes {
+		rackCandidates = append(rackCandidates, struct {
+			id   RackId
+			rack *EcRack
+		}{id, rack})
+	}
+
+	selector := &twoPassSelector[struct {
+		id   RackId
+		rack *EcRack
+	}]{
+		candidates:         rackCandidates,
+		shardsPerTarget:    shardsPerRack,
+		maxPerTarget:       maxPerRack,
+		targetToShardCount: rackToShardCount,
+		antiAffinity:       antiAffinityRacks,
+		getKey: func(c struct {
+			id   RackId
+			rack *EcRack
+		}) string {
+			return string(c.id)
+		},
+		hasFreeSlots: func(c struct {
+			id   RackId
+			rack *EcRack
+		}) bool {
+			return c.rack.freeEcSlot > 0
+		},
+		checkLimit: func(c struct {
+			id   RackId
+			rack *EcRack
+		}) bool {
+			// For EC shards, replica placement constraint only applies when DiffRackCount > 0.
+			if ecb.replicaPlacement != nil && ecb.replicaPlacement.DiffRackCount > 0 {
+				return rackToShardCount[string(c.id)] < ecb.replicaPlacement.DiffRackCount
+			}
+			return true
+		},
+	}
+
+	selected, err := selector.selectCandidate()
+	if err != nil {
+		return "", errors.New("no rack available for shard type balancing")
+	}
+	return selected.id, nil
 }
 
 func (ecb *ecBalancer) pickRackToBalanceShardsInto(rackToEcNodes map[RackId]*EcRack, rackToShardCount map[string]int) (RackId, error) {
@@ -1032,51 +1171,74 @@ func (ecb *ecBalancer) balanceEcShardsWithinRacks(collection string) error {
 
 		// see the volume's shards are in how many racks, and how many in each rack
 		rackToShardCount := countShardsByRack(vid, locations, ecb.diskType)
-		rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string {
-			return string(ecNode.rack)
-		})
 
-		for rackId, _ := range rackToShardCount {
+		for rackId := range rackToShardCount {
 			var possibleDestinationEcNodes []*EcNode
 			for _, n := range racks[RackId(rackId)].ecNodes {
 				if _, found := n.info.DiskInfos[string(ecb.diskType)]; found {
 					possibleDestinationEcNodes = append(possibleDestinationEcNodes, n)
 				}
 			}
-			sourceEcNodes := rackEcNodesWithVid[rackId]
-			averageShardsPerEcNode := ceilDivide(rackToShardCount[rackId], len(possibleDestinationEcNodes))
 			ewg.Add(func() error {
-				return ecb.doBalanceEcShardsWithinOneRack(averageShardsPerEcNode, collection, vid, sourceEcNodes, possibleDestinationEcNodes)
+				return ecb.doBalanceEcShardsWithinOneRack(collection, vid, possibleDestinationEcNodes)
 			})
 		}
 	}
 	return ewg.Wait()
 }
 
-func (ecb *ecBalancer) doBalanceEcShardsWithinOneRack(averageShardsPerEcNode int, collection string, vid needle.VolumeId, existingLocations, possibleDestinationEcNodes []*EcNode) error {
-	for _, ecNode := range existingLocations {
-		si := findEcVolumeShardsInfo(ecNode, vid, ecb.diskType)
-		overLimitCount := si.Count() - averageShardsPerEcNode
-
-		for _, shardId := range si.Ids() {
-
-			if overLimitCount <= 0 {
-				break
-			}
-
-			fmt.Printf("%s has %d overlimit, moving ec shard %d.%d\n", ecNode.info.Id, overLimitCount, vid, shardId)
-
-			err := ecb.pickOneEcNodeAndMoveOneShard(ecNode, collection, vid, shardId, possibleDestinationEcNodes)
-			if err != nil {
-				return err
-			}
-
-			overLimitCount--
-		}
-	}
-
+func (ecb *ecBalancer) doBalanceEcShardsWithinOneRack(collection string, vid needle.VolumeId, possibleDestinationEcNodes []*EcNode) error {
+	// Use configured EC scheme
+	dataShardCount := ecb.getDataShardCount()
+
+	// Get current distribution of data shards per node
+	dataPerNode, parityPerNode := shardsByTypePerNode(vid, possibleDestinationEcNodes, ecb.diskType, dataShardCount)
+
+	// Calculate max shards per node for each type
+	numNodes := len(possibleDestinationEcNodes)
+	if numNodes == 0 {
+		return nil
+	}
+
+	// Calculate totals based on actual shards present in the rack (subset of all shards)
+	totalData := 0
+	for _, shards := range dataPerNode {
+		totalData += len(shards)
+	}
+	totalParity := 0
+	for _, shards := range parityPerNode {
+		totalParity += len(shards)
+	}
+
+	maxDataPerNode := ceilDivide(totalData, numNodes)
+	maxParityPerNode := ceilDivide(totalParity, numNodes)
+
+	// Track total shard count per node
+	nodeToShardCount := countShardsByNode(vid, possibleDestinationEcNodes, ecb.diskType)
+
+	// First pass: Balance data shards across nodes
+	if err := ecb.balanceShardTypeAcrossNodes(collection, vid, possibleDestinationEcNodes, dataPerNode, nodeToShardCount, maxDataPerNode, "data", nil); err != nil {
+		return err
+	}
+
+	// Refresh locations after data shard moves
+	// We need to re-scan because moving shards changes node states
+	dataPerNode, parityPerNode = shardsByTypePerNode(vid, possibleDestinationEcNodes, ecb.diskType, dataShardCount)
+	nodeToShardCount = countShardsByNode(vid, possibleDestinationEcNodes, ecb.diskType)
+
+	// Identify nodes containing data shards to avoid for parity placement
+	antiAffinityNodes := make(map[string]bool)
+	for nodeId, shards := range dataPerNode {
+		if len(shards) > 0 {
+			antiAffinityNodes[nodeId] = true
+		}
+	}
+
+	// Second pass: Balance parity shards across nodes, avoiding nodes with data shards if possible
+	if err := ecb.balanceShardTypeAcrossNodes(collection, vid, possibleDestinationEcNodes, parityPerNode, nodeToShardCount, maxParityPerNode, "parity", antiAffinityNodes); err != nil {
+		return err
+	}
+
 	return nil
 }
 
@@ -1144,7 +1306,9 @@ func (ecb *ecBalancer) doBalanceEcRack(ecRack *EcRack) error {
 				for _, shardId := range si.Ids() {
 					vid := needle.VolumeId(shards.Id)
 					// For balancing, strictly require matching disk type
-					destDiskId := pickBestDiskOnNode(emptyNode, vid, ecb.diskType, true)
+					// For balancing, strictly require matching disk type and apply anti-affinity
+					dataShardCount := ecb.getDataShardCount()
+					destDiskId := pickBestDiskOnNode(emptyNode, vid, ecb.diskType, true, shardId, dataShardCount)
 
 					if destDiskId > 0 {
 						fmt.Printf("%s moves ec shards %d.%d to %s (disk %d)\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id, destDiskId)
@@ -1265,8 +1429,11 @@ func diskDistributionScore(ecNode *EcNode, vid needle.VolumeId) int {
 
 // pickBestDiskOnNode selects the best disk on a node for placing a new EC shard
 // It prefers disks of the specified type with fewer shards and more free slots
+// When shardId is provided and dataShardCount > 0, it applies anti-affinity:
+// - For data shards (shardId < dataShardCount): prefer disks without parity shards
+// - For parity shards (shardId >= dataShardCount): prefer disks without data shards
 // If strictDiskType is false, it will fall back to other disk types if no matching disk is found
-func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId, diskType types.DiskType, strictDiskType bool) uint32 {
+func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId, diskType types.DiskType, strictDiskType bool, shardId erasure_coding.ShardId, dataShardCount int) uint32 {
 	if len(ecNode.disks) == 0 {
 		return 0 // No disk info available, let the server decide
 	}
@@ -1276,21 +1443,47 @@ func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId, diskType types.Disk
 	var fallbackDiskId uint32
 	fallbackScore := -1
 
+	// Determine if we're placing a data or parity shard
+	isDataShard := dataShardCount > 0 && int(shardId) < dataShardCount
+
 	for diskId, disk := range ecNode.disks {
 		if disk.freeEcSlots <= 0 {
 			continue
 		}
 
-		// Check if this volume already has shards on this disk
+		// Check existing shards on this disk for this volume
 		existingShards := 0
+		hasDataShards := false
+		hasParityShards := false
 		if si, ok := disk.ecShards[vid]; ok {
 			existingShards = si.Count()
+			// Check what type of shards are on this disk
+			if dataShardCount > 0 {
+				for _, existingShardId := range si.Ids() {
+					if int(existingShardId) < dataShardCount {
+						hasDataShards = true
+					} else {
+						hasParityShards = true
+					}
+				}
+			}
 		}
 
 		// Score: prefer disks with fewer total shards and fewer shards of this volume
 		// Lower score is better
 		score := disk.ecShardCount*10 + existingShards*100
 
+		// Apply anti-affinity penalty if applicable
+		if dataShardCount > 0 {
+			if isDataShard && hasParityShards {
+				// Penalize placing data shard on disk with parity shards
+				score += 1000
+			} else if !isDataShard && hasDataShards {
+				// Penalize placing parity shard on disk with data shards
+				score += 1000
+			}
+		}
+
 		if disk.diskType == string(diskType) {
 			// Matching disk type - this is preferred
 			if bestScore == -1 || score < bestScore {
@@ -1314,19 +1507,20 @@ func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId, diskType types.Disk
 }
 
 // pickEcNodeAndDiskToBalanceShardsInto picks both a destination node and specific disk
-func (ecb *ecBalancer) pickEcNodeAndDiskToBalanceShardsInto(vid needle.VolumeId, existingLocation *EcNode, possibleDestinations []*EcNode) (*EcNode, uint32, error) {
+func (ecb *ecBalancer) pickEcNodeAndDiskToBalanceShardsInto(vid needle.VolumeId, shardId erasure_coding.ShardId, existingLocation *EcNode, possibleDestinations []*EcNode) (*EcNode, uint32, error) {
 	node, err := ecb.pickEcNodeToBalanceShardsInto(vid, existingLocation, possibleDestinations)
 	if err != nil {
 		return nil, 0, err
 	}
 
-	// For balancing, strictly require matching disk type
-	diskId := pickBestDiskOnNode(node, vid, ecb.diskType, true)
+	// For balancing, strictly require matching disk type and apply anti-affinity
+	dataShardCount := ecb.getDataShardCount()
+	diskId := pickBestDiskOnNode(node, vid, ecb.diskType, true, shardId, dataShardCount)
 	return node, diskId, nil
 }
 
 func (ecb *ecBalancer) pickOneEcNodeAndMoveOneShard(existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode) error {
-	destNode, destDiskId, err := ecb.pickEcNodeAndDiskToBalanceShardsInto(vid, existingLocation, possibleDestinationEcNodes)
+	destNode, destDiskId, err := ecb.pickEcNodeAndDiskToBalanceShardsInto(vid, shardId, existingLocation, possibleDestinationEcNodes)
 	if err != nil {
 		fmt.Printf("WARNING: Could not find suitable target node for %d.%d:\n%s", vid, shardId, err.Error())
 		return nil
@@ -1441,3 +1635,114 @@ func compileCollectionPattern(pattern string) (*regexp.Regexp, error) {
 	}
 	return regexp.Compile(pattern)
 }
+
+// balanceShardTypeAcrossNodes spreads shards of a specific type (data or parity) evenly across nodes
+func (ecb *ecBalancer) balanceShardTypeAcrossNodes(
+	collection string,
+	vid needle.VolumeId,
+	possibleDestinationEcNodes []*EcNode,
+	shardsPerNode map[string][]erasure_coding.ShardId,
+	nodeToShardCount map[string]int,
+	maxPerNode int,
+	shardType string,
+	antiAffinityNodes map[string]bool,
+) error {
+	// Map ID to EcNode for lookup
+	nodeMap := make(map[string]*EcNode)
+	for _, n := range possibleDestinationEcNodes {
+		nodeMap[n.info.Id] = n
+	}
+
+	// Find nodes with too many shards of this type
+	shardsToMove := make(map[erasure_coding.ShardId]*EcNode)
+	for nodeId, shards := range shardsPerNode {
+		if len(shards) <= maxPerNode {
+			continue
+		}
+		// Pick excess shards to move
+		excess := len(shards) - maxPerNode
+		ecNode := nodeMap[nodeId]
+		if ecNode == nil {
+			continue
+		}
+
+		for i := 0; i < excess && i < len(shards); i++ {
+			shardId := shards[i]
+			// Verify node has this shard
+			si := findEcVolumeShardsInfo(ecNode, vid, ecb.diskType)
+			if si.Has(shardId) {
+				shardsToMove[shardId] = ecNode
+			}
+		}
+	}
+
+	// Move shards to nodes that have fewer than maxPerNode of this type
+	for shardId, ecNode := range shardsToMove {
+		// Find destination node with room for this shard type
+		destNode, err := ecb.pickNodeForShardType(possibleDestinationEcNodes, shardsPerNode, maxPerNode, nodeToShardCount, antiAffinityNodes)
+		if err != nil {
+			fmt.Printf("ec %s shard %d.%d at %s can not find a destination node:\n%s\n", shardType, vid, shardId, ecNode.info.Id, err.Error())
+			continue
+		}
+
+		err = ecb.pickOneEcNodeAndMoveOneShard(ecNode, collection, vid, shardId, []*EcNode{destNode})
+		if err != nil {
+			return err
+		}
+
+		// Update tracking
+		destNodeId := destNode.info.Id
+		shardsPerNode[destNodeId] = append(shardsPerNode[destNodeId], shardId)
+
+		// Remove from source node
+		srcNodeId := ecNode.info.Id
+		for i, s := range shardsPerNode[srcNodeId] {
+			if s == shardId {
+				shardsPerNode[srcNodeId] = append(shardsPerNode[srcNodeId][:i], shardsPerNode[srcNodeId][i+1:]...)
+				break
+			}
+		}
+		nodeToShardCount[destNodeId] += 1
+		nodeToShardCount[srcNodeId] -= 1
+		destNode.freeEcSlot -= 1
+		ecNode.freeEcSlot += 1
+	}
+
+	return nil
+}
+
+// pickNodeForShardType selects a node that has room for more shards of a specific type
+func (ecb *ecBalancer) pickNodeForShardType(
+	nodes []*EcNode,
+	shardsPerNode map[string][]erasure_coding.ShardId,
+	maxPerNode int,
+	nodeToShardCount map[string]int,
+	antiAffinityNodes map[string]bool,
+) (*EcNode, error) {
+	selector := &twoPassSelector[*EcNode]{
+		candidates:         nodes,
+		shardsPerTarget:    shardsPerNode,
+		maxPerTarget:       maxPerNode,
+		targetToShardCount: nodeToShardCount,
+		antiAffinity:       antiAffinityNodes,
+		getKey: func(n *EcNode) string {
+			return n.info.Id
+		},
+		hasFreeSlots: func(n *EcNode) bool {
+			return n.freeEcSlot > 0
+		},
+		checkLimit: func(n *EcNode) bool {
+			// For EC shards, replica placement constraint only applies when SameRackCount > 0.
+			if ecb.replicaPlacement != nil && ecb.replicaPlacement.SameRackCount > 0 {
+				return nodeToShardCount[n.info.Id] < ecb.replicaPlacement.SameRackCount+1
+			}
+			return true
+		},
+	}
+
+	selected, err := selector.selectCandidate()
+	if err != nil {
+		return nil, errors.New("no node available for shard type balancing")
+	}
+	return selected, nil
+}
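The selector introduced above is generic over its candidate type, which is what lets the rack-level and node-level passes share one implementation. A minimal standalone sketch of the same two-pass idea (simplified fields and hypothetical names such as miniSelector; this is not code from the change itself):

package main

import (
	"errors"
	"fmt"
	"math/rand/v2"
)

// miniSelector mirrors the two-pass selection: pass 1 skips anti-affinity
// targets, pass 2 falls back to any target that still has room.
type miniSelector struct {
	candidates   []string
	shardsPer    map[string]int
	maxPer       int
	antiAffinity map[string]bool
}

func (s *miniSelector) pick() (string, error) {
	pass := func(skipAnti bool) []string {
		var best []string
		fewest := s.maxPer + 1
		for _, c := range s.candidates {
			n := s.shardsPer[c]
			if n >= s.maxPer {
				continue
			}
			if skipAnti && s.antiAffinity[c] {
				continue
			}
			if n < fewest {
				best, fewest = nil, n
			}
			if n == fewest {
				best = append(best, c)
			}
		}
		return best
	}
	best := pass(true)
	if len(best) == 0 {
		best = pass(false) // fallback: ignore anti-affinity
	}
	if len(best) == 0 {
		return "", errors.New("no valid candidate")
	}
	return best[rand.IntN(len(best))], nil
}

func main() {
	s := &miniSelector{
		candidates:   []string{"rack0", "rack1", "rack2"},
		shardsPer:    map[string]int{"rack0": 1},
		maxPer:       2,
		antiAffinity: map[string]bool{"rack0": true}, // rack0 already holds data shards
	}
	picked, _ := s.pick()
	fmt.Println(picked) // rack1 or rack2; rack0 is avoided while alternatives exist
}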
weed/shell/command_ec_common_avoid_test.go (new file, 377 lines)
@@ -0,0 +1,377 @@
+package shell
+
+import (
+	"testing"
+
+	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
+	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
+	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
+	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
+	"github.com/seaweedfs/seaweedfs/weed/storage/types"
+)
+
+func TestPickRackForShardType_AntiAffinityRacks(t *testing.T) {
+	// Setup topology with 3 racks, each with 1 node, enough free slots
+	topo := &master_pb.TopologyInfo{
+		Id: "test_topo",
+		DataCenterInfos: []*master_pb.DataCenterInfo{
+			{
+				Id: "dc1",
+				RackInfos: []*master_pb.RackInfo{
+					buildRackWithEcShards("rack0", "node0:8080", 100, nil),
+					buildRackWithEcShards("rack1", "node1:8080", 100, nil),
+					buildRackWithEcShards("rack2", "node2:8080", 100, nil),
+				},
+			},
+		},
+	}
+
+	ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+	ecb := &ecBalancer{
+		ecNodes:  ecNodes,
+		diskType: types.HardDriveType,
+	}
+
+	racks := ecb.racks()
+	rackToShardCount := make(map[string]int)
+	shardsPerRack := make(map[string][]erasure_coding.ShardId)
+	maxPerRack := 2
+
+	// Case 1: Avoid rack0
+	antiAffinityRacks := map[string]bool{"rack0": true}
+
+	// Try multiple times to ensure randomness doesn't accidentally pass
+	for i := 0; i < 20; i++ {
+		picked, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount, antiAffinityRacks)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if picked == "rack0" {
+			t.Errorf("picked avoided rack rack0")
+		}
+	}
+
+	// Case 2: Fallback - avoid all racks
+	avoidAll := map[string]bool{"rack0": true, "rack1": true, "rack2": true}
+	picked, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount, avoidAll)
+	if err != nil {
+		t.Fatalf("fallback failed: %v", err)
+	}
+	if picked == "" {
+		t.Errorf("expected some rack to be picked in fallback")
+	}
+}
+
+func TestPickRackForShardType_EdgeCases(t *testing.T) {
+	t.Run("NoFreeSlots", func(t *testing.T) {
+		topo := &master_pb.TopologyInfo{
+			Id: "test_topo",
+			DataCenterInfos: []*master_pb.DataCenterInfo{
+				{
+					Id: "dc1",
+					RackInfos: []*master_pb.RackInfo{
+						buildRackWithEcShards("rack0", "node0:8080", 0, nil), // maxVolumes=0
+						buildRackWithEcShards("rack1", "node1:8080", 0, nil),
+					},
+				},
+			},
+		}
+
+		ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+		ecb := &ecBalancer{
+			ecNodes:  ecNodes,
+			diskType: types.HardDriveType,
+		}
+
+		racks := ecb.racks()
+		_, err := ecb.pickRackForShardType(racks, make(map[string][]erasure_coding.ShardId), 2, make(map[string]int), nil)
+		if err == nil {
+			t.Error("expected error when no free slots, got nil")
+		}
+	})
+
+	t.Run("AllRacksAtMaxCapacity", func(t *testing.T) {
+		topo := &master_pb.TopologyInfo{
+			Id: "test_topo",
+			DataCenterInfos: []*master_pb.DataCenterInfo{
+				{
+					Id: "dc1",
+					RackInfos: []*master_pb.RackInfo{
+						buildRackWithEcShards("rack0", "node0:8080", 100, nil),
+						buildRackWithEcShards("rack1", "node1:8080", 100, nil),
+					},
+				},
+			},
+		}
+
+		ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+		ecb := &ecBalancer{
+			ecNodes:  ecNodes,
+			diskType: types.HardDriveType,
+		}
+
+		racks := ecb.racks()
+		shardsPerRack := map[string][]erasure_coding.ShardId{
+			"rack0": {0, 1}, // 2 shards
+			"rack1": {2, 3}, // 2 shards
+		}
+		maxPerRack := 2
+
+		_, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, make(map[string]int), nil)
+		if err == nil {
+			t.Error("expected error when all racks at max capacity, got nil")
+		}
+	})
+
+	t.Run("ReplicaPlacementLimit", func(t *testing.T) {
+		topo := &master_pb.TopologyInfo{
+			Id: "test_topo",
+			DataCenterInfos: []*master_pb.DataCenterInfo{
+				{
+					Id: "dc1",
+					RackInfos: []*master_pb.RackInfo{
+						buildRackWithEcShards("rack0", "node0:8080", 100, nil),
+						buildRackWithEcShards("rack1", "node1:8080", 100, nil),
+					},
+				},
+			},
+		}
+
+		ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+		rp, _ := super_block.NewReplicaPlacementFromString("012") // DiffRackCount = 1
+		ecb := &ecBalancer{
+			ecNodes:          ecNodes,
+			diskType:         types.HardDriveType,
+			replicaPlacement: rp,
+		}
+
+		racks := ecb.racks()
+		rackToShardCount := map[string]int{
+			"rack0": 1, // At limit
+			"rack1": 0,
+		}
+
+		picked, err := ecb.pickRackForShardType(racks, make(map[string][]erasure_coding.ShardId), 5, rackToShardCount, nil)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if picked != "rack1" {
+			t.Errorf("expected rack1 (not at limit), got %v", picked)
+		}
+	})
+
+	t.Run("PreferFewerShards", func(t *testing.T) {
+		topo := &master_pb.TopologyInfo{
+			Id: "test_topo",
+			DataCenterInfos: []*master_pb.DataCenterInfo{
+				{
+					Id: "dc1",
+					RackInfos: []*master_pb.RackInfo{
+						buildRackWithEcShards("rack0", "node0:8080", 100, nil),
+						buildRackWithEcShards("rack1", "node1:8080", 100, nil),
+						buildRackWithEcShards("rack2", "node2:8080", 100, nil),
+					},
+				},
+			},
+		}
+
+		ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+		ecb := &ecBalancer{
+			ecNodes:  ecNodes,
+			diskType: types.HardDriveType,
+		}
+
+		racks := ecb.racks()
+		shardsPerRack := map[string][]erasure_coding.ShardId{
+			"rack0": {0, 1}, // 2 shards
+			"rack1": {2},    // 1 shard
+			"rack2": {},     // 0 shards
+		}
+
+		// Should pick rack2 (fewest shards)
+		picked, err := ecb.pickRackForShardType(racks, shardsPerRack, 5, make(map[string]int), nil)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if picked != "rack2" {
+			t.Errorf("expected rack2 (fewest shards), got %v", picked)
+		}
+	})
+}
+
+func TestPickNodeForShardType_AntiAffinityNodes(t *testing.T) {
+	// Setup topology with 1 rack, 3 nodes
+	topo := &master_pb.TopologyInfo{
+		Id: "test_topo",
+		DataCenterInfos: []*master_pb.DataCenterInfo{
+			{
+				Id: "dc1",
+				RackInfos: []*master_pb.RackInfo{
+					{
+						Id: "rack0",
+						DataNodeInfos: []*master_pb.DataNodeInfo{
+							buildDataNode("node0:8080", 100),
+							buildDataNode("node1:8080", 100),
+							buildDataNode("node2:8080", 100),
+						},
+					},
+				},
+			},
+		},
+	}
+
+	ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+	ecb := &ecBalancer{
+		ecNodes:  ecNodes,
+		diskType: types.HardDriveType,
+	}
+
+	nodeToShardCount := make(map[string]int)
+	shardsPerNode := make(map[string][]erasure_coding.ShardId)
+	maxPerNode := 2
+
+	// Case 1: Avoid node0
+	antiAffinityNodes := map[string]bool{"node0:8080": true}
+
+	for i := 0; i < 20; i++ {
+		picked, err := ecb.pickNodeForShardType(ecNodes, shardsPerNode, maxPerNode, nodeToShardCount, antiAffinityNodes)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if picked.info.Id == "node0:8080" {
+			t.Errorf("picked avoided node node0")
+		}
+	}
+}
+
+func TestPickNodeForShardType_EdgeCases(t *testing.T) {
+	t.Run("NoFreeSlots", func(t *testing.T) {
+		topo := &master_pb.TopologyInfo{
+			Id: "test_topo",
+			DataCenterInfos: []*master_pb.DataCenterInfo{
+				{
+					Id: "dc1",
+					RackInfos: []*master_pb.RackInfo{
+						{
+							Id: "rack0",
+							DataNodeInfos: []*master_pb.DataNodeInfo{
+								buildDataNode("node0:8080", 0), // No capacity
+							},
+						},
+					},
+				},
+			},
+		}
+
+		ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+		ecb := &ecBalancer{
+			ecNodes:  ecNodes,
+			diskType: types.HardDriveType,
+		}
+
+		_, err := ecb.pickNodeForShardType(ecNodes, make(map[string][]erasure_coding.ShardId), 2, make(map[string]int), nil)
+		if err == nil {
+			t.Error("expected error when no free slots, got nil")
+		}
+	})
+
+	t.Run("ReplicaPlacementSameRackLimit", func(t *testing.T) {
+		topo := &master_pb.TopologyInfo{
+			Id: "test_topo",
+			DataCenterInfos: []*master_pb.DataCenterInfo{
+				{
+					Id: "dc1",
+					RackInfos: []*master_pb.RackInfo{
+						{
+							Id: "rack0",
+							DataNodeInfos: []*master_pb.DataNodeInfo{
+								buildDataNode("node0:8080", 100),
+								buildDataNode("node1:8080", 100),
+							},
+						},
+					},
+				},
+			},
+		}
+
+		ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType)
+		rp, _ := super_block.NewReplicaPlacementFromString("021") // SameRackCount = 1
+		ecb := &ecBalancer{
+			ecNodes:          ecNodes,
+			diskType:         types.HardDriveType,
+			replicaPlacement: rp,
+		}
+
+		nodeToShardCount := map[string]int{
+			"node0:8080": 3, // Exceeds SameRackCount + 1
+			"node1:8080": 0,
+		}
+
+		picked, err := ecb.pickNodeForShardType(ecNodes, make(map[string][]erasure_coding.ShardId), 5, nodeToShardCount, nil)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if picked.info.Id != "node1:8080" {
+			t.Errorf("expected node1 (not at limit), got %v", picked.info.Id)
+		}
+	})
+}
+
+func TestShardsByType(t *testing.T) {
+	vid := needle.VolumeId(123)
+
+	// Create mock nodes with shards
+	nodes := []*EcNode{
+		{
+			info: &master_pb.DataNodeInfo{
+				Id: "node1",
+				DiskInfos: map[string]*master_pb.DiskInfo{
+					string(types.HardDriveType): {
+						EcShardInfos: []*master_pb.VolumeEcShardInformationMessage{
+							{
+								Id:          uint32(vid),
+								EcIndexBits: uint32((1 << 0) | (1 << 1) | (1 << 10) | (1 << 11)), // data: 0,1 parity: 10,11
+							},
+						},
+					},
+				},
+			},
+			rack: "rack1",
+		},
+	}
+
+	t.Run("Standard10Plus4", func(t *testing.T) {
+		dataPerRack, parityPerRack := shardsByTypePerRack(vid, nodes, types.HardDriveType, 10)
+
+		if len(dataPerRack["rack1"]) != 2 {
+			t.Errorf("expected 2 data shards, got %d", len(dataPerRack["rack1"]))
+		}
+		if len(parityPerRack["rack1"]) != 2 {
+			t.Errorf("expected 2 parity shards, got %d", len(parityPerRack["rack1"]))
+		}
+	})
+
+	t.Run("NodeGrouping", func(t *testing.T) {
+		dataPerNode, parityPerNode := shardsByTypePerNode(vid, nodes, types.HardDriveType, 10)
+
+		if len(dataPerNode["node1"]) != 2 {
+			t.Errorf("expected 2 data shards on node1, got %d", len(dataPerNode["node1"]))
+		}
+		if len(parityPerNode["node1"]) != 2 {
+			t.Errorf("expected 2 parity shards on node1, got %d", len(parityPerNode["node1"]))
+		}
+	})
+}
+
+func buildDataNode(nodeId string, maxVolumes int64) *master_pb.DataNodeInfo {
+	return &master_pb.DataNodeInfo{
+		Id: nodeId,
+		DiskInfos: map[string]*master_pb.DiskInfo{
+			string(types.HardDriveType): {
+				Type:           string(types.HardDriveType),
+				MaxVolumeCount: maxVolumes,
+				VolumeCount:    0,
+			},
+		},
+	}
+}
@@ -227,7 +227,8 @@ func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv
 		}
 		vid := needle.VolumeId(ecShardInfo.Id)
 		// For evacuation, prefer same disk type but allow fallback to other types
-		destDiskId := pickBestDiskOnNode(emptyNode, vid, diskType, false)
+		// No anti-affinity needed for evacuation (dataShardCount=0)
+		destDiskId := pickBestDiskOnNode(emptyNode, vid, diskType, false, shardId, 0)
 		if destDiskId > 0 {
 			fmt.Fprintf(writer, "moving ec volume %s%d.%d %s => %s (disk %d)\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id, destDiskId)
 		} else {