* fix(weed/worker/tasks/ec_balance): non-recursive reportProgress * fix(ec_balance): call ReportProgressWithStage and include volumeID in log The original fix replaced infinite recursion with a glog.Infof, but skipped the framework progress callback. This adds the missing ReportProgressWithStage call so the admin server receives EC balance progress, and includes volumeID in the log for disambiguation. --------- Co-authored-by: Chris Lu <chris.lu@gmail.com>
226 lines
8.2 KiB
Go
226 lines
8.2 KiB
Go
package ec_balance
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/operation"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/worker/types"
|
|
"github.com/seaweedfs/seaweedfs/weed/worker/types/base"
|
|
"google.golang.org/grpc"
|
|
)
|
|
|
|
// ECBalanceTask implements a single EC shard move operation.
|
|
// The move sequence is: copy+mount on dest → unmount on source → delete on source.
|
|
type ECBalanceTask struct {
|
|
*base.BaseTask
|
|
volumeID uint32
|
|
collection string
|
|
grpcDialOption grpc.DialOption
|
|
progress float64
|
|
}
|
|
|
|
// NewECBalanceTask creates a new EC balance task instance
|
|
func NewECBalanceTask(id string, volumeID uint32, collection string, grpcDialOption grpc.DialOption) *ECBalanceTask {
|
|
return &ECBalanceTask{
|
|
BaseTask: base.NewBaseTask(id, types.TaskTypeECBalance),
|
|
volumeID: volumeID,
|
|
collection: collection,
|
|
grpcDialOption: grpcDialOption,
|
|
}
|
|
}
|
|
|
|
// Execute performs the EC shard move operation using the same RPC sequence
|
|
// as the shell ec.balance command's moveMountedShardToEcNode function.
|
|
func (t *ECBalanceTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
|
|
if params == nil {
|
|
return fmt.Errorf("task parameters are required")
|
|
}
|
|
|
|
if len(params.Sources) == 0 || len(params.Targets) == 0 {
|
|
return fmt.Errorf("sources and targets are required for EC shard move")
|
|
}
|
|
if len(params.Sources) > 1 || len(params.Targets) > 1 {
|
|
return fmt.Errorf("batch EC shard moves not supported: got %d sources and %d targets, expected 1 each", len(params.Sources), len(params.Targets))
|
|
}
|
|
|
|
source := params.Sources[0]
|
|
target := params.Targets[0]
|
|
|
|
if len(source.ShardIds) == 0 || len(target.ShardIds) == 0 {
|
|
return fmt.Errorf("shard IDs are required in sources and targets")
|
|
}
|
|
|
|
sourceAddr := pb.ServerAddress(source.Node)
|
|
targetAddr := pb.ServerAddress(target.Node)
|
|
|
|
ecParams := params.GetEcBalanceParams()
|
|
|
|
// Apply configured timeout to the context for all RPC operations
|
|
if ecParams != nil && ecParams.TimeoutSeconds > 0 {
|
|
var cancel context.CancelFunc
|
|
ctx, cancel = context.WithTimeout(ctx, time.Duration(ecParams.TimeoutSeconds)*time.Second)
|
|
defer cancel()
|
|
}
|
|
|
|
isDedupDelete := ecParams != nil && isDedupPhase(params)
|
|
|
|
glog.Infof("EC balance: moving shard(s) %v of volume %d from %s to %s",
|
|
source.ShardIds, params.VolumeId, source.Node, target.Node)
|
|
|
|
// For dedup, we only unmount+delete from source (no copy needed)
|
|
if isDedupDelete {
|
|
return t.executeDedupDelete(ctx, params.VolumeId, sourceAddr, source.ShardIds)
|
|
}
|
|
|
|
// Step 1: Copy shard to destination and mount
|
|
t.reportProgress(10.0, "Copying EC shard to destination")
|
|
if err := t.copyAndMountShard(ctx, params.VolumeId, sourceAddr, targetAddr, source.ShardIds, target.DiskId); err != nil {
|
|
return fmt.Errorf("copy and mount shard: %v", err)
|
|
}
|
|
|
|
// Step 2: Unmount shard on source
|
|
t.reportProgress(50.0, "Unmounting EC shard from source")
|
|
if err := t.unmountShard(ctx, params.VolumeId, sourceAddr, source.ShardIds); err != nil {
|
|
return fmt.Errorf("unmount shard on source: %v", err)
|
|
}
|
|
|
|
// Step 3: Delete shard from source
|
|
t.reportProgress(75.0, "Deleting EC shard from source")
|
|
if err := t.deleteShard(ctx, params.VolumeId, params.Collection, sourceAddr, source.ShardIds); err != nil {
|
|
return fmt.Errorf("delete shard on source: %v", err)
|
|
}
|
|
|
|
t.reportProgress(100.0, "EC shard move complete")
|
|
glog.Infof("EC balance: successfully moved shard(s) %v of volume %d from %s to %s",
|
|
source.ShardIds, params.VolumeId, source.Node, target.Node)
|
|
return nil
|
|
}
|
|
|
|
// executeDedupDelete removes a duplicate shard without copying
|
|
func (t *ECBalanceTask) executeDedupDelete(ctx context.Context, volumeID uint32, sourceAddr pb.ServerAddress, shardIDs []uint32) error {
|
|
t.reportProgress(25.0, "Unmounting duplicate EC shard")
|
|
if err := t.unmountShard(ctx, volumeID, sourceAddr, shardIDs); err != nil {
|
|
return fmt.Errorf("unmount duplicate shard: %v", err)
|
|
}
|
|
|
|
t.reportProgress(75.0, "Deleting duplicate EC shard")
|
|
if err := t.deleteShard(ctx, volumeID, t.collection, sourceAddr, shardIDs); err != nil {
|
|
return fmt.Errorf("delete duplicate shard: %v", err)
|
|
}
|
|
|
|
t.reportProgress(100.0, "Duplicate shard removed")
|
|
return nil
|
|
}
|
|
|
|
// copyAndMountShard copies EC shard from source to destination and mounts it
|
|
func (t *ECBalanceTask) copyAndMountShard(ctx context.Context, volumeID uint32, sourceAddr, targetAddr pb.ServerAddress, shardIDs []uint32, destDiskID uint32) error {
|
|
return operation.WithVolumeServerClient(false, targetAddr, t.grpcDialOption,
|
|
func(client volume_server_pb.VolumeServerClient) error {
|
|
// Copy shard data (if source != target)
|
|
if sourceAddr != targetAddr {
|
|
_, err := client.VolumeEcShardsCopy(ctx, &volume_server_pb.VolumeEcShardsCopyRequest{
|
|
VolumeId: volumeID,
|
|
Collection: t.collection,
|
|
ShardIds: shardIDs,
|
|
CopyEcxFile: true,
|
|
CopyEcjFile: true,
|
|
CopyVifFile: true,
|
|
SourceDataNode: string(sourceAddr),
|
|
DiskId: destDiskID,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("copy shard(s) %v from %s to %s: %v", shardIDs, sourceAddr, targetAddr, err)
|
|
}
|
|
}
|
|
|
|
// Mount the shard on destination
|
|
_, err := client.VolumeEcShardsMount(ctx, &volume_server_pb.VolumeEcShardsMountRequest{
|
|
VolumeId: volumeID,
|
|
Collection: t.collection,
|
|
ShardIds: shardIDs,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("mount shard(s) %v on %s: %v", shardIDs, targetAddr, err)
|
|
}
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// unmountShard unmounts EC shards from a server
|
|
func (t *ECBalanceTask) unmountShard(ctx context.Context, volumeID uint32, addr pb.ServerAddress, shardIDs []uint32) error {
|
|
return operation.WithVolumeServerClient(false, addr, t.grpcDialOption,
|
|
func(client volume_server_pb.VolumeServerClient) error {
|
|
_, err := client.VolumeEcShardsUnmount(ctx, &volume_server_pb.VolumeEcShardsUnmountRequest{
|
|
VolumeId: volumeID,
|
|
ShardIds: shardIDs,
|
|
})
|
|
return err
|
|
})
|
|
}
|
|
|
|
// deleteShard deletes EC shards from a server
|
|
func (t *ECBalanceTask) deleteShard(ctx context.Context, volumeID uint32, collection string, addr pb.ServerAddress, shardIDs []uint32) error {
|
|
return operation.WithVolumeServerClient(false, addr, t.grpcDialOption,
|
|
func(client volume_server_pb.VolumeServerClient) error {
|
|
_, err := client.VolumeEcShardsDelete(ctx, &volume_server_pb.VolumeEcShardsDeleteRequest{
|
|
VolumeId: volumeID,
|
|
Collection: collection,
|
|
ShardIds: shardIDs,
|
|
})
|
|
return err
|
|
})
|
|
}
|
|
|
|
// Validate validates the task parameters.
|
|
// ECBalanceTask handles exactly one source→target shard move per execution.
|
|
func (t *ECBalanceTask) Validate(params *worker_pb.TaskParams) error {
|
|
if params == nil {
|
|
return fmt.Errorf("ECBalanceTask.Validate: TaskParams are required")
|
|
}
|
|
if len(params.Sources) != 1 {
|
|
return fmt.Errorf("ECBalanceTask.Validate: expected exactly 1 source, got %d", len(params.Sources))
|
|
}
|
|
if len(params.Targets) != 1 {
|
|
return fmt.Errorf("ECBalanceTask.Validate: expected exactly 1 target, got %d", len(params.Targets))
|
|
}
|
|
if len(params.Sources[0].ShardIds) == 0 {
|
|
return fmt.Errorf("ECBalanceTask.Validate: Sources[0].ShardIds is empty")
|
|
}
|
|
if len(params.Targets[0].ShardIds) == 0 {
|
|
return fmt.Errorf("ECBalanceTask.Validate: Targets[0].ShardIds is empty")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// EstimateTime estimates the time for an EC shard move
|
|
func (t *ECBalanceTask) EstimateTime(params *worker_pb.TaskParams) time.Duration {
|
|
return 30 * time.Second
|
|
}
|
|
|
|
// GetProgress returns current progress
|
|
func (t *ECBalanceTask) GetProgress() float64 {
|
|
return t.progress
|
|
}
|
|
|
|
// reportProgress updates the stored progress and reports it via the callback
|
|
func (t *ECBalanceTask) reportProgress(progress float64, stage string) {
|
|
t.progress = progress
|
|
t.ReportProgressWithStage(progress, stage)
|
|
glog.Infof("EC balance volume %d: [%.2f] %s", t.volumeID, progress, stage)
|
|
}
|
|
|
|
// isDedupPhase checks if this is a dedup-phase task (source and target are the same node)
|
|
func isDedupPhase(params *worker_pb.TaskParams) bool {
|
|
if len(params.Sources) > 0 && len(params.Targets) > 0 {
|
|
return params.Sources[0].Node == params.Targets[0].Node
|
|
}
|
|
return false
|
|
}
|