seaweedFS/weed/shell/command_volume_server_evacuate.go
Chris Lu 4f038820dc Add disk-aware EC rebalancing (#7597), 2025-12-02 12:30:15 -08:00
* Add placement package for EC shard placement logic

- Consolidate EC shard placement algorithm for reuse across shell and worker tasks
- Support multi-pass selection: racks, then servers, then disks
- Include proper spread verification and scoring functions
- Comprehensive test coverage for various cluster topologies
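A rough sketch of the multi-pass selection idea, not the actual placement package code: the DiskCandidate fields and the SelectDestinations signature below are simplified assumptions.

package placement

import "sort"

// DiskCandidate is a simplified stand-in for the real candidate type;
// the actual fields in the placement package may differ.
type DiskCandidate struct {
	Rack       string
	Server     string
	DiskId     uint32
	ShardCount int // EC shards already on this disk
}

type diskKey struct {
	server string
	diskId uint32
}

// SelectDestinations picks up to n destination disks in three passes:
// one disk per unused rack first, then one per unused server, then any
// remaining disk, always preferring the least-loaded disks.
func SelectDestinations(candidates []DiskCandidate, n int) []DiskCandidate {
	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].ShardCount < candidates[j].ShardCount
	})
	usedRacks := map[string]bool{}
	usedServers := map[string]bool{}
	usedDisks := map[diskKey]bool{}
	var picked []DiskCandidate
	passes := []func(DiskCandidate) bool{
		func(c DiskCandidate) bool { return !usedRacks[c.Rack] },     // spread across racks
		func(c DiskCandidate) bool { return !usedServers[c.Server] }, // then across servers
		func(c DiskCandidate) bool { return true },                   // then fill remaining disks
	}
	for _, eligible := range passes {
		for _, c := range candidates {
			if len(picked) == n {
				return picked
			}
			key := diskKey{c.Server, c.DiskId}
			if usedDisks[key] || !eligible(c) {
				continue
			}
			usedDisks[key] = true
			usedRacks[c.Rack] = true
			usedServers[c.Server] = true
			picked = append(picked, c)
		}
	}
	return picked
}

Sorting by current shard count up front biases every pass, rack, server, and fill alike, toward the emptiest disks, which matches the scoring intent described above.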

* Make ec.balance disk-aware for multi-disk servers

- Add EcDisk struct to track individual disks on volume servers
- Update EcNode to maintain per-disk shard distribution
- Parse disk_id from EC shard information during topology collection
- Implement pickBestDiskOnNode() for selecting best disk per shard
- Add diskDistributionScore() for tie-breaking node selection
- Update all move operations to specify target disk in RPC calls
- Improves shard balance within multi-disk servers, not just across servers
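As an illustration only, per-disk tracking and best-disk selection could look like the sketch below; the real EcDisk fields and the actual pickBestDiskOnNode and diskDistributionScore implementations may differ.

type ecDiskSketch struct {
	DiskId         uint32
	ShardCount     int            // all EC shards currently on this disk
	ShardsByVolume map[uint32]int // EC shards per volume id on this disk
}

type ecNodeSketch struct {
	Disks []ecDiskSketch
}

// pickBestDiskSketch prefers the disk holding the fewest shards of the given
// volume, breaking ties by the fewest shards overall.
func pickBestDiskSketch(node ecNodeSketch, volumeId uint32) (diskId uint32, found bool) {
	var bestForVolume, bestTotal int
	for _, d := range node.Disks {
		onDisk := d.ShardsByVolume[volumeId]
		if !found || onDisk < bestForVolume || (onDisk == bestForVolume && d.ShardCount < bestTotal) {
			bestForVolume, bestTotal = onDisk, d.ShardCount
			diskId, found = d.DiskId, true
		}
	}
	return
}

Tie-breaking on the disk's total shard count plays the same role as diskDistributionScore: among equally good disks for this volume, prefer the emptier one.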

* Use placement package in EC detection for consistent disk-level placement

- Replace custom EC disk selection logic with shared placement package
- Convert topology DiskInfo to placement.DiskCandidate format
- Use SelectDestinations() for multi-rack/server/disk spreading
- Convert placement results back to topology DiskInfo for task creation
- Ensures EC detection uses same placement logic as shell commands
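Reusing the sketch types above, a hypothetical detection-side call might simply request one destination per shard and map the picks back to the original disks:

// chooseEcDestinationsSketch is illustrative only; the real detection code
// converts topology DiskInfo values to and from the placement package types.
func chooseEcDestinationsSketch(candidates []DiskCandidate) []DiskCandidate {
	const totalShards = 10 + 4 // data + parity shards in the default SeaweedFS EC layout
	return SelectDestinations(candidates, totalShards)
}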

* Make volume server evacuation disk-aware

- Use pickBestDiskOnNode() when selecting evacuation target disk
- Specify target disk in evacuation RPC requests
- Maintains balanced disk distribution during server evacuations

* Rename PlacementConfig to PlacementRequest for clarity

PlacementRequest better reflects that this is a request for placement
rather than a configuration object. This improves API semantics.

* Rename DefaultConfig to DefaultPlacementRequest

Aligns with the PlacementRequest type naming for consistency

* Address review comments from Gemini and CodeRabbit

Fix HIGH issues:
- Fix empty disk discovery: Now discovers all disks from VolumeInfos,
  not just from EC shards. This ensures disks without EC shards are
  still considered for placement.
- Fix EC shard count calculation in detection.go: Now correctly filters
  by DiskId and sums actual shard counts using ShardBits.ShardIdCount()
  instead of just counting EcShardInfo entries.

Fix MEDIUM issues:
- Add disk ID to evacuation log messages for consistency with other logging
- Remove unused serverToDisks variable in placement.go
- Fix comment that incorrectly said 'ascending' when sorting is 'descending'
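A sketch of the corrected per-disk shard count, mirroring the fix described above rather than quoting the actual detection.go code; it assumes the master_pb and erasure_coding packages imported in the file below.

// ecShardCountOnDisk counts the EC shards of all volumes that live on one disk.
func ecShardCountOnDisk(ecShardInfos []*master_pb.VolumeEcShardInformationMessage, diskId uint32) int {
	count := 0
	for _, info := range ecShardInfos {
		if info.DiskId != diskId {
			continue // only count shards on the disk being scored
		}
		// EcIndexBits is a bitmask of shard ids; ShardIdCount reports how many are set.
		count += erasure_coding.ShardBits(info.EcIndexBits).ShardIdCount()
	}
	return count
}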

* add ec tests

* Update ec-integration-tests.yml

* Update ec_integration_test.go

* Fix EC integration tests CI: build weed binary and update actions

- Add 'Build weed binary' step before running tests
- Update actions/setup-go from v4 to v6 (Node20 compatibility)
- Update actions/checkout from v2 to v4 (Node20 compatibility)
- Move working-directory to test step only

* Add disk-aware EC rebalancing integration tests

- Add TestDiskAwareECRebalancing test with multi-disk cluster setup
- Test EC encode with disk awareness (shows disk ID in output)
- Test EC balance with disk-level shard distribution
- Add helper functions for disk-level verification:
  - startMultiDiskCluster: 3 servers x 4 disks each
  - countShardsPerDisk: track shards per disk per server
  - calculateDiskShardVariance: measure distribution balance
- Verify no single disk is overloaded with shards
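One plausible reading of the variance helper is the population variance of shard counts across all disks; the real calculateDiskShardVariance may compute it differently.

// diskShardVarianceSketch measures how unevenly shards are spread across disks;
// shardsPerDisk maps "server/diskId" to the shard count on that disk.
func diskShardVarianceSketch(shardsPerDisk map[string]int) float64 {
	if len(shardsPerDisk) == 0 {
		return 0
	}
	var sum float64
	for _, n := range shardsPerDisk {
		sum += float64(n)
	}
	mean := sum / float64(len(shardsPerDisk))
	var variance float64
	for _, n := range shardsPerDisk {
		d := float64(n) - mean
		variance += d * d
	}
	return variance / float64(len(shardsPerDisk))
}

A variance near zero means shards are spread evenly across disks; a large value flags a disk carrying far more shards than its peers.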

package shell
import (
"flag"
"fmt"
"io"
"os"
"slices"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
func init() {
Commands = append(Commands, &commandVolumeServerEvacuate{})
}
type commandVolumeServerEvacuate struct {
topologyInfo *master_pb.TopologyInfo
targetServer *string
volumeRack *string
}
func (c *commandVolumeServerEvacuate) Name() string {
return "volumeServer.evacuate"
}
func (c *commandVolumeServerEvacuate) Help() string {
return `move out all data on a volume server
volumeServer.evacuate -node <host:port>
This command moves all data away from the volume server.
The volumes on the volume server will be redistributed.
Usually this is used to prepare for shutting down or upgrading the volume server.
Sometimes a volume can not be moved because there is no
good destination that meets the replication requirement.
E.g. a volume with replication 001 in a cluster with 2 volume servers can not be moved.
You can use "-skipNonMoveable" to skip such volumes and move the rest.
`
}
func (c *commandVolumeServerEvacuate) HasTag(CommandTag) bool {
return false
}
func (c *commandVolumeServerEvacuate) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
vsEvacuateCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
volumeServer := vsEvacuateCommand.String("node", "", "<host>:<port> of the volume server")
c.volumeRack = vsEvacuateCommand.String("rack", "", "source rack for the volume servers")
c.targetServer = vsEvacuateCommand.String("target", "", "<host>:<port> of the target volume server")
skipNonMoveable := vsEvacuateCommand.Bool("skipNonMoveable", false, "skip volumes that can not be moved")
applyChange := vsEvacuateCommand.Bool("apply", false, "actually apply the changes")
// TODO: remove this alias
applyChangeAlias := vsEvacuateCommand.Bool("force", false, "actually apply the changes (alias for -apply)")
retryCount := vsEvacuateCommand.Int("retry", 0, "how many times to retry")
if err = vsEvacuateCommand.Parse(args); err != nil {
return nil
}
handleDeprecatedForceFlag(writer, vsEvacuateCommand, applyChangeAlias, applyChange)
infoAboutSimulationMode(writer, *applyChange, "-apply")
if err = commandEnv.confirmIsLocked(args); err != nil && *applyChange {
return
}
if *volumeServer == "" && *c.volumeRack == "" {
return fmt.Errorf("need to specify volume server by -node=<host>:<port> or source rack")
}
for i := 0; i < *retryCount+1; i++ {
if err = c.volumeServerEvacuate(commandEnv, *volumeServer, *skipNonMoveable, *applyChange, writer); err == nil {
return nil
}
}
return
}
func (c *commandVolumeServerEvacuate) volumeServerEvacuate(commandEnv *CommandEnv, volumeServer string, skipNonMoveable, applyChange bool, writer io.Writer) (err error) {
// 1. confirm the volume server is part of the cluster
// 2. collect all other volume servers, sort by empty slots
// 3. move to any other volume server as long as it satisfy the replication requirements
// list all the volumes
// collect topology information
c.topologyInfo, _, err = collectTopologyInfo(commandEnv, 0)
if err != nil {
return err
}
defer func() {
c.topologyInfo = nil
}()
if err := c.evacuateNormalVolumes(commandEnv, volumeServer, skipNonMoveable, applyChange, writer); err != nil {
return err
}
if err := c.evacuateEcVolumes(commandEnv, volumeServer, skipNonMoveable, applyChange, writer); err != nil {
return err
}
return nil
}
func (c *commandVolumeServerEvacuate) evacuateNormalVolumes(commandEnv *CommandEnv, volumeServer string, skipNonMoveable, applyChange bool, writer io.Writer) error {
// find this volume server
volumeServers := collectVolumeServersByDcRackNode(c.topologyInfo, "", "", "")
thisNodes, otherNodes := c.nodesOtherThan(volumeServers, volumeServer)
if len(thisNodes) == 0 {
return fmt.Errorf("%s is not found in this cluster", volumeServer)
}
// move away normal volumes
for _, thisNode := range thisNodes {
for _, diskInfo := range thisNode.info.DiskInfos {
if applyChange {
if topologyInfo, _, err := collectTopologyInfo(commandEnv, 0); err != nil {
fmt.Fprintf(writer, "update topologyInfo %v", err)
} else {
_, otherNodesNew := c.nodesOtherThan(
collectVolumeServersByDcRackNode(topologyInfo, "", "", ""), volumeServer)
if len(otherNodesNew) > 0 {
otherNodes = otherNodesNew
c.topologyInfo = topologyInfo
fmt.Fprintf(writer, "topologyInfo updated %v\n", len(otherNodes))
}
}
}
volumeReplicas, _ := collectVolumeReplicaLocations(c.topologyInfo)
for _, vol := range diskInfo.VolumeInfos {
hasMoved, err := moveAwayOneNormalVolume(commandEnv, volumeReplicas, vol, thisNode, otherNodes, applyChange)
if err != nil {
fmt.Fprintf(writer, "move away volume %d from %s: %v\n", vol.Id, volumeServer, err)
}
if !hasMoved {
if skipNonMoveable {
replicaPlacement, _ := super_block.NewReplicaPlacementFromByte(byte(vol.ReplicaPlacement))
fmt.Fprintf(writer, "skipping non moveable volume %d replication:%s\n", vol.Id, replicaPlacement.String())
} else {
return fmt.Errorf("failed to move volume %d from %s", vol.Id, volumeServer)
}
}
}
}
}
return nil
}
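// evacuateEcVolumes moves all EC shards off the given volume server (or off
// every server in the source rack), walking each disk's EcShardInfos and
// delegating the per-volume work to moveAwayOneEcVolume.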
func (c *commandVolumeServerEvacuate) evacuateEcVolumes(commandEnv *CommandEnv, volumeServer string, skipNonMoveable, applyChange bool, writer io.Writer) error {
// find this ec volume server
ecNodes, _ := collectEcVolumeServersByDc(c.topologyInfo, "")
thisNodes, otherNodes := c.ecNodesOtherThan(ecNodes, volumeServer)
if len(thisNodes) == 0 {
return fmt.Errorf("%s is not found in this cluster\n", volumeServer)
}
// move away ec volumes
for _, thisNode := range thisNodes {
for _, diskInfo := range thisNode.info.DiskInfos {
for _, ecShardInfo := range diskInfo.EcShardInfos {
hasMoved, err := c.moveAwayOneEcVolume(commandEnv, ecShardInfo, thisNode, otherNodes, applyChange)
if err != nil {
fmt.Fprintf(writer, "move away volume %d from %s: %v", ecShardInfo.Id, volumeServer, err)
}
if !hasMoved {
if skipNonMoveable {
fmt.Fprintf(writer, "failed to move away ec volume %d from %s\n", ecShardInfo.Id, volumeServer)
} else {
return fmt.Errorf("failed to move away ec volume %d from %s", ecShardInfo.Id, volumeServer)
}
}
}
}
}
return nil
}
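// moveAwayOneEcVolume moves the shards of one EC volume off thisNode, one shard
// at a time. Before each shard it re-sorts otherNodes so the node holding the
// fewest shards of this volume is tried first, asks pickBestDiskOnNode for a
// preferred destination disk (0 means no specific disk), and returns early if a
// move fails or no destination node is available.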
func (c *commandVolumeServerEvacuate) moveAwayOneEcVolume(commandEnv *CommandEnv, ecShardInfo *master_pb.VolumeEcShardInformationMessage, thisNode *EcNode, otherNodes []*EcNode, applyChange bool) (hasMoved bool, err error) {
for _, shardId := range erasure_coding.ShardBits(ecShardInfo.EcIndexBits).ShardIds() {
slices.SortFunc(otherNodes, func(a, b *EcNode) int {
return a.localShardIdCount(ecShardInfo.Id) - b.localShardIdCount(ecShardInfo.Id)
})
for i := 0; i < len(otherNodes); i++ {
emptyNode := otherNodes[i]
collectionPrefix := ""
if ecShardInfo.Collection != "" {
collectionPrefix = ecShardInfo.Collection + "_"
}
vid := needle.VolumeId(ecShardInfo.Id)
destDiskId := pickBestDiskOnNode(emptyNode, vid)
if destDiskId > 0 {
fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s (disk %d)\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id, destDiskId)
} else {
fmt.Fprintf(os.Stdout, "moving ec volume %s%d.%d %s => %s\n", collectionPrefix, ecShardInfo.Id, shardId, thisNode.info.Id, emptyNode.info.Id)
}
err = moveMountedShardToEcNode(commandEnv, thisNode, ecShardInfo.Collection, vid, shardId, emptyNode, destDiskId, applyChange)
if err != nil {
return
} else {
hasMoved = true
break
}
}
if !hasMoved {
return
}
}
return
}
func moveAwayOneNormalVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, vol *master_pb.VolumeInformationMessage, thisNode *Node, otherNodes []*Node, applyChange bool) (hasMoved bool, err error) {
freeVolumeCountfn := capacityByFreeVolumeCount(types.ToDiskType(vol.DiskType))
maxVolumeCountFn := capacityByMaxVolumeCount(types.ToDiskType(vol.DiskType))
for _, n := range otherNodes {
n.selectVolumes(func(v *master_pb.VolumeInformationMessage) bool {
return v.DiskType == vol.DiskType
})
}
// most empty one is in the front
slices.SortFunc(otherNodes, func(a, b *Node) int {
// compare the ratios directly: casting the difference to int truncates
// fractional differences to zero and leaves the slice effectively unsorted
ratioA, ratioB := a.localVolumeRatio(maxVolumeCountFn), b.localVolumeRatio(maxVolumeCountFn)
if ratioA < ratioB {
return -1
}
if ratioA > ratioB {
return 1
}
return 0
})
for i := 0; i < len(otherNodes); i++ {
emptyNode := otherNodes[i]
if freeVolumeCountfn(emptyNode.info) <= 0 {
continue
}
hasMoved, err = maybeMoveOneVolume(commandEnv, volumeReplicas, thisNode, vol, emptyNode, applyChange)
if err != nil {
return
}
if hasMoved {
break
}
}
return
}
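// nodesOtherThan splits volumeServers into the nodes being evacuated (the named
// server, plus every node in the source rack when -rack is set) and the
// candidate destination nodes, optionally restricted to the -target server.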
func (c *commandVolumeServerEvacuate) nodesOtherThan(volumeServers []*Node, thisServer string) (thisNodes []*Node, otherNodes []*Node) {
for _, node := range volumeServers {
if node.info.Id == thisServer || (*c.volumeRack != "" && node.rack == *c.volumeRack) {
thisNodes = append(thisNodes, node)
continue
}
if *c.volumeRack != "" && *c.volumeRack == node.rack {
continue
}
if *c.targetServer != "" && *c.targetServer != node.info.Id {
continue
}
otherNodes = append(otherNodes, node)
}
return
}
func (c *commandVolumeServerEvacuate) ecNodesOtherThan(volumeServers []*EcNode, thisServer string) (thisNodes []*EcNode, otherNodes []*EcNode) {
for _, node := range volumeServers {
if node.info.Id == thisServer || (*c.volumeRack != "" && string(node.rack) == *c.volumeRack) {
thisNodes = append(thisNodes, node)
continue
}
if *c.volumeRack != "" && *c.volumeRack == string(node.rack) {
continue
}
if *c.targetServer != "" && *c.targetServer != node.info.Id {
continue
}
otherNodes = append(otherNodes, node)
}
return
}