admin: Refactor task destination planning (#7063)
* refactor planning into task detection
* refactoring worker tasks
* refactor
* compiles, but only balance task is registered
* compiles, but has nil exception
* avoid nil logger
* add back ec task
* setting ec log directory
* implement balance and vacuum tasks
* EC tasks will no longer fail with "file not found" errors
* Use ReceiveFile API to send locally generated shards
* distributing shard files and ecx,ecj,vif files
* generate .ecx files correctly
* do not mount all possible EC shards (0-13) on every destination
* use constants
* delete all replicas
* rename files
* pass in volume size to tasks
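The core of the refactor is that destination planning now happens at detection time and travels with the task as typed protobuf parameters, instead of being filled in later by the maintenance integration. A condensed sketch of that flow, pieced together from the hunks below (field values are illustrative only):

// In detection: attach the planned destination to the task as typed params.
task.TypedParams = &worker_pb.TaskParams{
	VolumeId:   selectedVolume.VolumeID,
	Server:     selectedVolume.Server,
	VolumeSize: selectedVolume.Size, // original size, so later size changes can be tracked
	TaskParams: &worker_pb.TaskParams_BalanceParams{
		BalanceParams: &worker_pb.BalanceTaskParams{DestNode: destinationPlan.TargetNode},
	},
}

// In the worker: the registered factory builds a task straight from those params.
CreateTask: func(params *worker_pb.TaskParams) (types.Task, error) {
	return NewBalanceTask(fmt.Sprintf("balance-%d", params.VolumeId),
		params.Server, params.VolumeId, params.Collection), nil
},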
@@ -1,141 +0,0 @@
package balance

import (
	"context"
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Task implements balance operation to redistribute volumes across volume servers
type Task struct {
	*tasks.BaseTask
	server     string
	volumeID   uint32
	collection string

	// Task parameters for accessing planned destinations
	taskParams types.TaskParams
}

// NewTask creates a new balance task instance
func NewTask(server string, volumeID uint32, collection string) *Task {
	task := &Task{
		BaseTask:   tasks.NewBaseTask(types.TaskTypeBalance),
		server:     server,
		volumeID:   volumeID,
		collection: collection,
	}
	return task
}

// Execute executes the balance task
func (t *Task) Execute(params types.TaskParams) error {
	// Use BaseTask.ExecuteTask to handle logging initialization
	return t.ExecuteTask(context.Background(), params, t.executeImpl)
}

// executeImpl is the actual balance implementation
func (t *Task) executeImpl(ctx context.Context, params types.TaskParams) error {
	// Store task parameters for accessing planned destinations
	t.taskParams = params

	// Get planned destination
	destNode := t.getPlannedDestination()
	if destNode != "" {
		t.LogWithFields("INFO", "Starting balance task with planned destination", map[string]interface{}{
			"volume_id":   t.volumeID,
			"source":      t.server,
			"destination": destNode,
			"collection":  t.collection,
		})
	} else {
		t.LogWithFields("INFO", "Starting balance task without specific destination", map[string]interface{}{
			"volume_id":  t.volumeID,
			"server":     t.server,
			"collection": t.collection,
		})
	}

	// Simulate balance operation with progress updates
	steps := []struct {
		name     string
		duration time.Duration
		progress float64
	}{
		{"Analyzing cluster state", 2 * time.Second, 15},
		{"Identifying optimal placement", 3 * time.Second, 35},
		{"Moving volume data", 6 * time.Second, 75},
		{"Updating cluster metadata", 2 * time.Second, 95},
		{"Verifying balance", 1 * time.Second, 100},
	}

	for _, step := range steps {
		select {
		case <-ctx.Done():
			t.LogWarning("Balance task cancelled during step: %s", step.name)
			return ctx.Err()
		default:
		}

		if t.IsCancelled() {
			t.LogWarning("Balance task cancelled by request during step: %s", step.name)
			return fmt.Errorf("balance task cancelled")
		}

		t.LogWithFields("INFO", "Executing balance step", map[string]interface{}{
			"step":      step.name,
			"progress":  step.progress,
			"duration":  step.duration.String(),
			"volume_id": t.volumeID,
		})
		t.SetProgress(step.progress)

		// Simulate work
		time.Sleep(step.duration)
	}

	t.LogWithFields("INFO", "Balance task completed successfully", map[string]interface{}{
		"volume_id":      t.volumeID,
		"server":         t.server,
		"collection":     t.collection,
		"final_progress": 100.0,
	})
	return nil
}

// Validate validates the task parameters
func (t *Task) Validate(params types.TaskParams) error {
	if params.VolumeID == 0 {
		return fmt.Errorf("volume_id is required")
	}
	if params.Server == "" {
		return fmt.Errorf("server is required")
	}
	return nil
}

// getPlannedDestination extracts the planned destination node from task parameters
func (t *Task) getPlannedDestination() string {
	if t.taskParams.TypedParams != nil {
		if balanceParams := t.taskParams.TypedParams.GetBalanceParams(); balanceParams != nil {
			if balanceParams.DestNode != "" {
				glog.V(2).Infof("Found planned destination for volume %d: %s", t.volumeID, balanceParams.DestNode)
				return balanceParams.DestNode
			}
		}
	}
	return ""
}

// EstimateTime estimates the time needed for the task
func (t *Task) EstimateTime(params types.TaskParams) time.Duration {
	// Base time for balance operation
	baseTime := 35 * time.Second

	// Could adjust based on volume size or cluster state
	return baseTime
}
weed/worker/tasks/balance/balance_task.go (new file, 248 lines)
@@ -0,0 +1,248 @@
package balance

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/util"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
	"github.com/seaweedfs/seaweedfs/weed/worker/types/base"
	"google.golang.org/grpc"
)

// BalanceTask implements the Task interface
type BalanceTask struct {
	*base.BaseTask
	server     string
	volumeID   uint32
	collection string
	progress   float64
}

// NewBalanceTask creates a new balance task instance
func NewBalanceTask(id string, server string, volumeID uint32, collection string) *BalanceTask {
	return &BalanceTask{
		BaseTask:   base.NewBaseTask(id, types.TaskTypeBalance),
		server:     server,
		volumeID:   volumeID,
		collection: collection,
	}
}

// Execute implements the Task interface
func (t *BalanceTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}

	balanceParams := params.GetBalanceParams()
	if balanceParams == nil {
		return fmt.Errorf("balance parameters are required")
	}

	// Get planned destination
	destNode := balanceParams.DestNode

	if destNode == "" {
		return fmt.Errorf("destination node is required for balance task")
	}

	t.GetLogger().WithFields(map[string]interface{}{
		"volume_id":   t.volumeID,
		"source":      t.server,
		"destination": destNode,
		"collection":  t.collection,
	}).Info("Starting balance task - moving volume")

	sourceServer := pb.ServerAddress(t.server)
	targetServer := pb.ServerAddress(destNode)
	volumeId := needle.VolumeId(t.volumeID)

	// Step 1: Mark volume readonly
	t.ReportProgress(10.0)
	t.GetLogger().Info("Marking volume readonly for move")
	if err := t.markVolumeReadonly(sourceServer, volumeId); err != nil {
		return fmt.Errorf("failed to mark volume readonly: %v", err)
	}

	// Step 2: Copy volume to destination
	t.ReportProgress(20.0)
	t.GetLogger().Info("Copying volume to destination")
	lastAppendAtNs, err := t.copyVolume(sourceServer, targetServer, volumeId)
	if err != nil {
		return fmt.Errorf("failed to copy volume: %v", err)
	}

	// Step 3: Mount volume on target and mark it readonly
	t.ReportProgress(60.0)
	t.GetLogger().Info("Mounting volume on target server")
	if err := t.mountVolume(targetServer, volumeId); err != nil {
		return fmt.Errorf("failed to mount volume on target: %v", err)
	}

	// Step 4: Tail for updates
	t.ReportProgress(70.0)
	t.GetLogger().Info("Syncing final updates")
	if err := t.tailVolume(sourceServer, targetServer, volumeId, lastAppendAtNs); err != nil {
		glog.Warningf("Tail operation failed (may be normal): %v", err)
	}

	// Step 5: Unmount from source
	t.ReportProgress(85.0)
	t.GetLogger().Info("Unmounting volume from source server")
	if err := t.unmountVolume(sourceServer, volumeId); err != nil {
		return fmt.Errorf("failed to unmount volume from source: %v", err)
	}

	// Step 6: Delete from source
	t.ReportProgress(95.0)
	t.GetLogger().Info("Deleting volume from source server")
	if err := t.deleteVolume(sourceServer, volumeId); err != nil {
		return fmt.Errorf("failed to delete volume from source: %v", err)
	}

	t.ReportProgress(100.0)
	glog.Infof("Balance task completed successfully: volume %d moved from %s to %s",
		t.volumeID, t.server, destNode)
	return nil
}

// Validate implements the UnifiedTask interface
func (t *BalanceTask) Validate(params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}

	balanceParams := params.GetBalanceParams()
	if balanceParams == nil {
		return fmt.Errorf("balance parameters are required")
	}

	if params.VolumeId != t.volumeID {
		return fmt.Errorf("volume ID mismatch: expected %d, got %d", t.volumeID, params.VolumeId)
	}

	if params.Server != t.server {
		return fmt.Errorf("source server mismatch: expected %s, got %s", t.server, params.Server)
	}

	return nil
}

// EstimateTime implements the UnifiedTask interface
func (t *BalanceTask) EstimateTime(params *worker_pb.TaskParams) time.Duration {
	// Basic estimate based on simulated steps
	return 14 * time.Second // Sum of all step durations
}

// GetProgress returns current progress
func (t *BalanceTask) GetProgress() float64 {
	return t.progress
}

// Helper methods for real balance operations

// markVolumeReadonly marks the volume readonly
func (t *BalanceTask) markVolumeReadonly(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeMarkReadonly(context.Background(), &volume_server_pb.VolumeMarkReadonlyRequest{
				VolumeId: uint32(volumeId),
			})
			return err
		})
}

// copyVolume copies volume from source to target server
func (t *BalanceTask) copyVolume(sourceServer, targetServer pb.ServerAddress, volumeId needle.VolumeId) (uint64, error) {
	var lastAppendAtNs uint64

	err := operation.WithVolumeServerClient(true, targetServer, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			stream, err := client.VolumeCopy(context.Background(), &volume_server_pb.VolumeCopyRequest{
				VolumeId:       uint32(volumeId),
				SourceDataNode: string(sourceServer),
			})
			if err != nil {
				return err
			}

			for {
				resp, recvErr := stream.Recv()
				if recvErr != nil {
					if recvErr == io.EOF {
						break
					}
					return recvErr
				}

				if resp.LastAppendAtNs != 0 {
					lastAppendAtNs = resp.LastAppendAtNs
				} else {
					// Report copy progress
					glog.V(1).Infof("Volume %d copy progress: %s", volumeId,
						util.BytesToHumanReadable(uint64(resp.ProcessedBytes)))
				}
			}

			return nil
		})

	return lastAppendAtNs, err
}

// mountVolume mounts the volume on the target server
func (t *BalanceTask) mountVolume(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeMount(context.Background(), &volume_server_pb.VolumeMountRequest{
				VolumeId: uint32(volumeId),
			})
			return err
		})
}

// tailVolume syncs remaining updates from source to target
func (t *BalanceTask) tailVolume(sourceServer, targetServer pb.ServerAddress, volumeId needle.VolumeId, sinceNs uint64) error {
	return operation.WithVolumeServerClient(true, targetServer, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeTailReceiver(context.Background(), &volume_server_pb.VolumeTailReceiverRequest{
				VolumeId:           uint32(volumeId),
				SinceNs:            sinceNs,
				IdleTimeoutSeconds: 60, // 1 minute timeout
				SourceVolumeServer: string(sourceServer),
			})
			return err
		})
}

// unmountVolume unmounts the volume from the server
func (t *BalanceTask) unmountVolume(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeUnmount(context.Background(), &volume_server_pb.VolumeUnmountRequest{
				VolumeId: uint32(volumeId),
			})
			return err
		})
}

// deleteVolume deletes the volume from the server
func (t *BalanceTask) deleteVolume(server pb.ServerAddress, volumeId needle.VolumeId) error {
	return operation.WithVolumeServerClient(false, server, grpc.WithInsecure(),
		func(client volume_server_pb.VolumeServerClient) error {
			_, err := client.VolumeDelete(context.Background(), &volume_server_pb.VolumeDeleteRequest{
				VolumeId:  uint32(volumeId),
				OnlyEmpty: false,
			})
			return err
		})
}
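For orientation, a minimal usage sketch (not part of the commit; the addresses, IDs, and collection name are made up) showing how the new task consumes its typed parameters:

task := NewBalanceTask("balance-42", "vs1:8080", 42, "pictures")
params := &worker_pb.TaskParams{
	VolumeId:   42,
	Server:     "vs1:8080",
	Collection: "pictures",
	TaskParams: &worker_pb.TaskParams_BalanceParams{
		BalanceParams: &worker_pb.BalanceTaskParams{DestNode: "vs2:8080"},
	},
}
if err := task.Validate(params); err == nil {
	_ = task.Execute(context.Background(), params) // runs the six-step move above
}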
@@ -4,7 +4,9 @@ import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/admin/topology"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)
@@ -89,46 +91,144 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
 			Priority:   types.TaskPriorityNormal,
 			Reason:     reason,
 			ScheduleAt: time.Now(),
 			// TypedParams will be populated by the maintenance integration
 			// with destination planning information
 		}
 
+		// Plan destination if ActiveTopology is available
+		if clusterInfo.ActiveTopology != nil {
+			destinationPlan, err := planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
+			if err != nil {
+				glog.Warningf("Failed to plan balance destination for volume %d: %v", selectedVolume.VolumeID, err)
+				return nil, nil // Skip this task if destination planning fails
+			}
+
+			// Create typed parameters with destination information
+			task.TypedParams = &worker_pb.TaskParams{
+				VolumeId:   selectedVolume.VolumeID,
+				Server:     selectedVolume.Server,
+				Collection: selectedVolume.Collection,
+				VolumeSize: selectedVolume.Size, // Store original volume size for tracking changes
+				TaskParams: &worker_pb.TaskParams_BalanceParams{
+					BalanceParams: &worker_pb.BalanceTaskParams{
+						DestNode:           destinationPlan.TargetNode,
+						EstimatedSize:      destinationPlan.ExpectedSize,
+						PlacementScore:     destinationPlan.PlacementScore,
+						PlacementConflicts: destinationPlan.Conflicts,
+						ForceMove:          false,
+						TimeoutSeconds:     600, // 10 minutes default
+					},
+				},
+			}
+
+			glog.V(1).Infof("Planned balance destination for volume %d: %s -> %s (score: %.2f)",
+				selectedVolume.VolumeID, selectedVolume.Server, destinationPlan.TargetNode, destinationPlan.PlacementScore)
+		} else {
+			glog.Warningf("No ActiveTopology available for destination planning in balance detection")
+			return nil, nil
+		}
 
 	return []*types.TaskDetectionResult{task}, nil
 }
 
-// Scheduling implements the scheduling logic for balance tasks
-func Scheduling(task *types.Task, runningTasks []*types.Task, availableWorkers []*types.Worker, config base.TaskConfig) bool {
-	balanceConfig := config.(*Config)
-
-	// Count running balance tasks
-	runningBalanceCount := 0
-	for _, runningTask := range runningTasks {
-		if runningTask.Type == types.TaskTypeBalance {
-			runningBalanceCount++
-		}
-	}
-
-	// Check concurrency limit
-	if runningBalanceCount >= balanceConfig.MaxConcurrent {
-		return false
-	}
-
-	// Check if we have available workers
-	availableWorkerCount := 0
-	for _, worker := range availableWorkers {
-		for _, capability := range worker.Capabilities {
-			if capability == types.TaskTypeBalance {
-				availableWorkerCount++
-				break
-			}
-		}
-	}
-
-	return availableWorkerCount > 0
-}
-
-// CreateTask creates a new balance task instance
-func CreateTask(params types.TaskParams) (types.TaskInterface, error) {
-	// Create and return the balance task using existing Task type
-	return NewTask(params.Server, params.VolumeID, params.Collection), nil
-}
+// planBalanceDestination plans the destination for a balance operation
+// This function implements destination planning logic directly in the detection phase
+func planBalanceDestination(activeTopology *topology.ActiveTopology, selectedVolume *types.VolumeHealthMetrics) (*topology.DestinationPlan, error) {
+	// Get source node information from topology
+	var sourceRack, sourceDC string
+
+	// Extract rack and DC from topology info
+	topologyInfo := activeTopology.GetTopologyInfo()
+	if topologyInfo != nil {
+		for _, dc := range topologyInfo.DataCenterInfos {
+			for _, rack := range dc.RackInfos {
+				for _, dataNodeInfo := range rack.DataNodeInfos {
+					if dataNodeInfo.Id == selectedVolume.Server {
+						sourceDC = dc.Id
+						sourceRack = rack.Id
+						break
+					}
+				}
+				if sourceRack != "" {
+					break
+				}
+			}
+			if sourceDC != "" {
+				break
+			}
+		}
+	}
+
+	// Get available disks, excluding the source node
+	availableDisks := activeTopology.GetAvailableDisks(topology.TaskTypeBalance, selectedVolume.Server)
+	if len(availableDisks) == 0 {
+		return nil, fmt.Errorf("no available disks for balance operation")
+	}
+
+	// Find the best destination disk based on balance criteria
+	var bestDisk *topology.DiskInfo
+	bestScore := -1.0
+
+	for _, disk := range availableDisks {
+		score := calculateBalanceScore(disk, sourceRack, sourceDC, selectedVolume.Size)
+		if score > bestScore {
+			bestScore = score
+			bestDisk = disk
+		}
+	}
+
+	if bestDisk == nil {
+		return nil, fmt.Errorf("no suitable destination found for balance operation")
+	}
+
+	return &topology.DestinationPlan{
+		TargetNode:     bestDisk.NodeID,
+		TargetDisk:     bestDisk.DiskID,
+		TargetRack:     bestDisk.Rack,
+		TargetDC:       bestDisk.DataCenter,
+		ExpectedSize:   selectedVolume.Size,
+		PlacementScore: bestScore,
+		Conflicts:      checkPlacementConflicts(bestDisk, sourceRack, sourceDC),
+	}, nil
+}
+
+// calculateBalanceScore calculates placement score for balance operations
+func calculateBalanceScore(disk *topology.DiskInfo, sourceRack, sourceDC string, volumeSize uint64) float64 {
+	if disk.DiskInfo == nil {
+		return 0.0
+	}
+
+	score := 0.0
+
+	// Prefer disks with lower current volume count (better for balance)
+	if disk.DiskInfo.MaxVolumeCount > 0 {
+		utilization := float64(disk.DiskInfo.VolumeCount) / float64(disk.DiskInfo.MaxVolumeCount)
+		score += (1.0 - utilization) * 40.0 // Up to 40 points for low utilization
+	}
+
+	// Prefer different racks for better distribution
+	if disk.Rack != sourceRack {
+		score += 30.0
+	}
+
+	// Prefer different data centers for better distribution
+	if disk.DataCenter != sourceDC {
+		score += 20.0
+	}
+
+	// Prefer disks with lower current load
+	score += (10.0 - float64(disk.LoadCount)) // Up to 10 points for low load
+
+	return score
+}
+
+// checkPlacementConflicts checks for placement rule conflicts
+func checkPlacementConflicts(disk *topology.DiskInfo, sourceRack, sourceDC string) []string {
+	var conflicts []string
+
+	// For now, implement basic conflict detection
+	// This could be extended with more sophisticated placement rules
+	if disk.Rack == sourceRack && disk.DataCenter == sourceDC {
+		conflicts = append(conflicts, "same_rack_as_source")
+	}
+
+	return conflicts
+}
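As a worked example of the scoring above (the numbers are hypothetical): a candidate disk at 25% volume utilization, in a different rack but the same data center as the source, with a LoadCount of 2, scores (1 - 0.25)*40 + 30 + 0 + (10 - 2) = 68; an otherwise identical disk sharing the source's rack and data center scores only 38 and is additionally flagged with the same_rack_as_source conflict.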
weed/worker/tasks/balance/monitoring.go (new file, 138 lines)
@@ -0,0 +1,138 @@
package balance

import (
	"sync"
	"time"
)

// BalanceMetrics contains balance-specific monitoring data
type BalanceMetrics struct {
	// Execution metrics
	VolumesBalanced      int64     `json:"volumes_balanced"`
	TotalDataTransferred int64     `json:"total_data_transferred"`
	AverageImbalance     float64   `json:"average_imbalance"`
	LastBalanceTime      time.Time `json:"last_balance_time"`

	// Performance metrics
	AverageTransferSpeed float64 `json:"average_transfer_speed_mbps"`
	TotalExecutionTime   int64   `json:"total_execution_time_seconds"`
	SuccessfulOperations int64   `json:"successful_operations"`
	FailedOperations     int64   `json:"failed_operations"`

	// Current task metrics
	CurrentImbalanceScore float64 `json:"current_imbalance_score"`
	PlannedDestinations   int     `json:"planned_destinations"`

	mutex sync.RWMutex
}

// NewBalanceMetrics creates a new balance metrics instance
func NewBalanceMetrics() *BalanceMetrics {
	return &BalanceMetrics{
		LastBalanceTime: time.Now(),
	}
}

// RecordVolumeBalanced records a successful volume balance operation
func (m *BalanceMetrics) RecordVolumeBalanced(volumeSize int64, transferTime time.Duration) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.VolumesBalanced++
	m.TotalDataTransferred += volumeSize
	m.SuccessfulOperations++
	m.LastBalanceTime = time.Now()
	m.TotalExecutionTime += int64(transferTime.Seconds())

	// Calculate average transfer speed (MB/s)
	if transferTime > 0 {
		speedMBps := float64(volumeSize) / (1024 * 1024) / transferTime.Seconds()
		if m.AverageTransferSpeed == 0 {
			m.AverageTransferSpeed = speedMBps
		} else {
			// Exponential moving average
			m.AverageTransferSpeed = 0.8*m.AverageTransferSpeed + 0.2*speedMBps
		}
	}
}

// RecordFailure records a failed balance operation
func (m *BalanceMetrics) RecordFailure() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.FailedOperations++
}

// UpdateImbalanceScore updates the current cluster imbalance score
func (m *BalanceMetrics) UpdateImbalanceScore(score float64) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.CurrentImbalanceScore = score

	// Update average imbalance with exponential moving average
	if m.AverageImbalance == 0 {
		m.AverageImbalance = score
	} else {
		m.AverageImbalance = 0.9*m.AverageImbalance + 0.1*score
	}
}

// SetPlannedDestinations sets the number of planned destinations
func (m *BalanceMetrics) SetPlannedDestinations(count int) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.PlannedDestinations = count
}

// GetMetrics returns a copy of the current metrics (without the mutex)
func (m *BalanceMetrics) GetMetrics() BalanceMetrics {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	// Create a copy without the mutex to avoid copying lock value
	return BalanceMetrics{
		VolumesBalanced:       m.VolumesBalanced,
		TotalDataTransferred:  m.TotalDataTransferred,
		AverageImbalance:      m.AverageImbalance,
		LastBalanceTime:       m.LastBalanceTime,
		AverageTransferSpeed:  m.AverageTransferSpeed,
		TotalExecutionTime:    m.TotalExecutionTime,
		SuccessfulOperations:  m.SuccessfulOperations,
		FailedOperations:      m.FailedOperations,
		CurrentImbalanceScore: m.CurrentImbalanceScore,
		PlannedDestinations:   m.PlannedDestinations,
	}
}

// GetSuccessRate returns the success rate as a percentage
func (m *BalanceMetrics) GetSuccessRate() float64 {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	total := m.SuccessfulOperations + m.FailedOperations
	if total == 0 {
		return 100.0
	}
	return float64(m.SuccessfulOperations) / float64(total) * 100.0
}

// Reset resets all metrics to zero
func (m *BalanceMetrics) Reset() {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	*m = BalanceMetrics{
		LastBalanceTime: time.Now(),
	}
}

// Global metrics instance for balance tasks
var globalBalanceMetrics = NewBalanceMetrics()

// GetGlobalBalanceMetrics returns the global balance metrics instance
func GetGlobalBalanceMetrics() *BalanceMetrics {
	return globalBalanceMetrics
}
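A hypothetical snippet (not in the commit) of how a worker could feed these metrics after a completed move; the size, duration, and score are made up:

m := GetGlobalBalanceMetrics()
m.RecordVolumeBalanced(2<<30, 95*time.Second) // 2 GiB moved in 95 s
m.UpdateImbalanceScore(0.18)
glog.V(1).Infof("balance success rate: %.1f%%", m.GetSuccessRate())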
@@ -5,6 +5,7 @@ import (
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
@@ -35,9 +36,19 @@ func RegisterBalanceTask() {
 		Icon:         "fas fa-balance-scale text-warning",
 		Capabilities: []string{"balance", "distribution"},
 
-		Config:     config,
-		ConfigSpec: GetConfigSpec(),
-		CreateTask: CreateTask,
+		Config:     config,
+		ConfigSpec: GetConfigSpec(),
+		CreateTask: func(params *worker_pb.TaskParams) (types.Task, error) {
+			if params == nil {
+				return nil, fmt.Errorf("task parameters are required")
+			}
+			return NewBalanceTask(
+				fmt.Sprintf("balance-%d", params.VolumeId),
+				params.Server,
+				params.VolumeId,
+				params.Collection,
+			), nil
+		},
 		DetectionFunc:  Detection,
 		ScanInterval:   30 * time.Minute,
 		SchedulingFunc: Scheduling,
weed/worker/tasks/balance/scheduling.go (new file, 37 lines)
@@ -0,0 +1,37 @@
package balance

import (
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/base"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

// Scheduling implements the scheduling logic for balance tasks
func Scheduling(task *types.TaskInput, runningTasks []*types.TaskInput, availableWorkers []*types.WorkerData, config base.TaskConfig) bool {
	balanceConfig := config.(*Config)

	// Count running balance tasks
	runningBalanceCount := 0
	for _, runningTask := range runningTasks {
		if runningTask.Type == types.TaskTypeBalance {
			runningBalanceCount++
		}
	}

	// Check concurrency limit
	if runningBalanceCount >= balanceConfig.MaxConcurrent {
		return false
	}

	// Check if we have available workers
	availableWorkerCount := 0
	for _, worker := range availableWorkers {
		for _, capability := range worker.Capabilities {
			if capability == types.TaskTypeBalance {
				availableWorkerCount++
				break
			}
		}
	}

	return availableWorkerCount > 0
}
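For example, assuming Config.MaxConcurrent is 1 and one balance task is already running, Scheduling returns false regardless of worker availability; once that task completes, it returns true as soon as any available worker advertises the TaskTypeBalance capability.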