Files
seaweedFS/weed/worker/tasks/ec_balance/ec_balance_task.go
Lars Lehtonen 3a5016bcd7 fix(weed/worker/tasks/ec_balance): non-recursive reportProgress (#8892)
* fix(weed/worker/tasks/ec_balance): non-recursive reportProgress

* fix(ec_balance): call ReportProgressWithStage and include volumeID in log

The original fix replaced infinite recursion with a glog.Infof, but
skipped the framework progress callback. This adds the missing
ReportProgressWithStage call so the admin server receives EC balance
progress, and includes volumeID in the log for disambiguation.

---------

Co-authored-by: Chris Lu <chris.lu@gmail.com>
2026-04-02 15:32:57 -07:00

226 lines
8.2 KiB
Go

package ec_balance
import (
"context"
"fmt"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/operation"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
"github.com/seaweedfs/seaweedfs/weed/worker/types/base"
"google.golang.org/grpc"
)
// ECBalanceTask implements a single EC shard move operation.
// The move sequence is: copy+mount on dest → unmount on source → delete on source.
type ECBalanceTask struct {
	*base.BaseTask                 // embedded framework task (provides ID, type, progress callback)
	volumeID       uint32          // volume whose EC shards are being moved; included in progress logs
	collection     string          // collection the volume belongs to; passed to copy/mount/delete RPCs
	grpcDialOption grpc.DialOption // dial option used for all volume-server gRPC connections
	progress       float64         // last reported progress percentage, returned by GetProgress
}
// NewECBalanceTask builds an EC balance task for one shard move, wiring up
// the embedded base task with the given ID and the EC-balance task type.
func NewECBalanceTask(id string, volumeID uint32, collection string, grpcDialOption grpc.DialOption) *ECBalanceTask {
	task := &ECBalanceTask{
		BaseTask:       base.NewBaseTask(id, types.TaskTypeECBalance),
		volumeID:       volumeID,
		collection:     collection,
		grpcDialOption: grpcDialOption,
	}
	return task
}
// Execute performs the EC shard move operation using the same RPC sequence
// as the shell ec.balance command's moveMountedShardToEcNode function:
// copy+mount on the destination, unmount on the source, then delete on the
// source. A dedup-phase task (same source and target node) skips the copy
// and only unmounts+deletes the duplicate shards.
func (t *ECBalanceTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
	// Validate the parameter shape up front: exactly one source and one target.
	switch {
	case params == nil:
		return fmt.Errorf("task parameters are required")
	case len(params.Sources) == 0 || len(params.Targets) == 0:
		return fmt.Errorf("sources and targets are required for EC shard move")
	case len(params.Sources) > 1 || len(params.Targets) > 1:
		return fmt.Errorf("batch EC shard moves not supported: got %d sources and %d targets, expected 1 each", len(params.Sources), len(params.Targets))
	}

	src, dst := params.Sources[0], params.Targets[0]
	if len(src.ShardIds) == 0 || len(dst.ShardIds) == 0 {
		return fmt.Errorf("shard IDs are required in sources and targets")
	}

	srcAddr := pb.ServerAddress(src.Node)
	dstAddr := pb.ServerAddress(dst.Node)

	// Bound every RPC below by the configured timeout, when one is set.
	ecParams := params.GetEcBalanceParams()
	if ecParams != nil && ecParams.TimeoutSeconds > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, time.Duration(ecParams.TimeoutSeconds)*time.Second)
		defer cancel()
	}

	glog.Infof("EC balance: moving shard(s) %v of volume %d from %s to %s",
		src.ShardIds, params.VolumeId, src.Node, dst.Node)

	// Dedup phase: the shard already exists elsewhere, so there is nothing
	// to copy — just remove it from the source.
	if ecParams != nil && isDedupPhase(params) {
		return t.executeDedupDelete(ctx, params.VolumeId, srcAddr, src.ShardIds)
	}

	// Step 1: copy the shard to the destination and mount it there.
	t.reportProgress(10.0, "Copying EC shard to destination")
	if err := t.copyAndMountShard(ctx, params.VolumeId, srcAddr, dstAddr, src.ShardIds, dst.DiskId); err != nil {
		return fmt.Errorf("copy and mount shard: %v", err)
	}

	// Step 2: unmount the shard on the source.
	t.reportProgress(50.0, "Unmounting EC shard from source")
	if err := t.unmountShard(ctx, params.VolumeId, srcAddr, src.ShardIds); err != nil {
		return fmt.Errorf("unmount shard on source: %v", err)
	}

	// Step 3: delete the shard files from the source.
	t.reportProgress(75.0, "Deleting EC shard from source")
	if err := t.deleteShard(ctx, params.VolumeId, params.Collection, srcAddr, src.ShardIds); err != nil {
		return fmt.Errorf("delete shard on source: %v", err)
	}

	t.reportProgress(100.0, "EC shard move complete")
	glog.Infof("EC balance: successfully moved shard(s) %v of volume %d from %s to %s",
		src.ShardIds, params.VolumeId, src.Node, dst.Node)
	return nil
}
// executeDedupDelete removes a duplicate shard from the source node without
// copying anything: the shard is unmounted first, then its files are deleted.
// Uses the task's own collection (set at construction) for the delete RPC.
func (t *ECBalanceTask) executeDedupDelete(ctx context.Context, volumeID uint32, sourceAddr pb.ServerAddress, shardIDs []uint32) error {
	t.reportProgress(25.0, "Unmounting duplicate EC shard")
	err := t.unmountShard(ctx, volumeID, sourceAddr, shardIDs)
	if err != nil {
		return fmt.Errorf("unmount duplicate shard: %v", err)
	}

	t.reportProgress(75.0, "Deleting duplicate EC shard")
	err = t.deleteShard(ctx, volumeID, t.collection, sourceAddr, shardIDs)
	if err != nil {
		return fmt.Errorf("delete duplicate shard: %v", err)
	}

	t.reportProgress(100.0, "Duplicate shard removed")
	return nil
}
// copyAndMountShard copies the given EC shards of a volume from the source
// server to the destination disk and mounts them there. When source and
// target are the same server the copy is skipped and only the mount runs.
// All RPCs are issued against the target server.
func (t *ECBalanceTask) copyAndMountShard(ctx context.Context, volumeID uint32, sourceAddr, targetAddr pb.ServerAddress, shardIDs []uint32, destDiskID uint32) error {
	copyThenMount := func(client volume_server_pb.VolumeServerClient) error {
		// Only copy when the shard actually lives on a different server.
		if sourceAddr != targetAddr {
			copyReq := &volume_server_pb.VolumeEcShardsCopyRequest{
				VolumeId:       volumeID,
				Collection:     t.collection,
				ShardIds:       shardIDs,
				CopyEcxFile:    true,
				CopyEcjFile:    true,
				CopyVifFile:    true,
				SourceDataNode: string(sourceAddr),
				DiskId:         destDiskID,
			}
			if _, err := client.VolumeEcShardsCopy(ctx, copyReq); err != nil {
				return fmt.Errorf("copy shard(s) %v from %s to %s: %v", shardIDs, sourceAddr, targetAddr, err)
			}
		}

		// Make the shard serveable on the destination.
		mountReq := &volume_server_pb.VolumeEcShardsMountRequest{
			VolumeId:   volumeID,
			Collection: t.collection,
			ShardIds:   shardIDs,
		}
		if _, err := client.VolumeEcShardsMount(ctx, mountReq); err != nil {
			return fmt.Errorf("mount shard(s) %v on %s: %v", shardIDs, targetAddr, err)
		}
		return nil
	}
	return operation.WithVolumeServerClient(false, targetAddr, t.grpcDialOption, copyThenMount)
}
// unmountShard unmounts the given EC shards of a volume on one server.
func (t *ECBalanceTask) unmountShard(ctx context.Context, volumeID uint32, addr pb.ServerAddress, shardIDs []uint32) error {
	unmount := func(client volume_server_pb.VolumeServerClient) error {
		request := &volume_server_pb.VolumeEcShardsUnmountRequest{
			VolumeId: volumeID,
			ShardIds: shardIDs,
		}
		_, err := client.VolumeEcShardsUnmount(ctx, request)
		return err
	}
	return operation.WithVolumeServerClient(false, addr, t.grpcDialOption, unmount)
}
// deleteShard deletes the given EC shards of a volume from one server.
// The collection is passed explicitly because the dedup path uses the task's
// own collection while the move path uses the one from the task parameters.
func (t *ECBalanceTask) deleteShard(ctx context.Context, volumeID uint32, collection string, addr pb.ServerAddress, shardIDs []uint32) error {
	remove := func(client volume_server_pb.VolumeServerClient) error {
		request := &volume_server_pb.VolumeEcShardsDeleteRequest{
			VolumeId:   volumeID,
			Collection: collection,
			ShardIds:   shardIDs,
		}
		_, err := client.VolumeEcShardsDelete(ctx, request)
		return err
	}
	return operation.WithVolumeServerClient(false, addr, t.grpcDialOption, remove)
}
// Validate validates the task parameters.
// ECBalanceTask handles exactly one source→target shard move per execution,
// and both sides must name at least one shard ID.
func (t *ECBalanceTask) Validate(params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("ECBalanceTask.Validate: TaskParams are required")
	}
	if got := len(params.Sources); got != 1 {
		return fmt.Errorf("ECBalanceTask.Validate: expected exactly 1 source, got %d", got)
	}
	if got := len(params.Targets); got != 1 {
		return fmt.Errorf("ECBalanceTask.Validate: expected exactly 1 target, got %d", got)
	}
	if len(params.Sources[0].ShardIds) == 0 {
		return fmt.Errorf("ECBalanceTask.Validate: Sources[0].ShardIds is empty")
	}
	if len(params.Targets[0].ShardIds) == 0 {
		return fmt.Errorf("ECBalanceTask.Validate: Targets[0].ShardIds is empty")
	}
	return nil
}
// EstimateTime estimates the time for an EC shard move.
// A flat estimate is returned regardless of params.
func (t *ECBalanceTask) EstimateTime(params *worker_pb.TaskParams) time.Duration {
	const shardMoveEstimate = 30 * time.Second
	return shardMoveEstimate
}
// GetProgress returns the most recently recorded progress percentage.
func (t *ECBalanceTask) GetProgress() (progress float64) {
	progress = t.progress
	return progress
}
// reportProgress records the new percentage on the task, forwards it to the
// framework progress callback (ReportProgressWithStage, so the admin server
// receives EC balance progress), and logs the stage with the volume ID for
// disambiguation when multiple balance tasks run concurrently.
func (t *ECBalanceTask) reportProgress(progress float64, stage string) {
	t.progress = progress
	t.ReportProgressWithStage(progress, stage)
	glog.Infof("EC balance volume %d: [%.2f] %s", t.volumeID, progress, stage)
}
// isDedupPhase reports whether this is a dedup-phase task, i.e. the first
// source and first target refer to the same node (so no copy is needed).
// Returns false when either list is empty.
func isDedupPhase(params *worker_pb.TaskParams) bool {
	if len(params.Sources) == 0 || len(params.Targets) == 0 {
		return false
	}
	return params.Sources[0].Node == params.Targets[0].Node
}