Files
seaweedFS/weed/worker/tasks/vacuum/vacuum_task.go
Chris Lu b17e2b411a Add dynamic timeouts to plugin worker vacuum gRPC calls (#8593)
* add dynamic timeouts to plugin worker vacuum gRPC calls

All vacuum gRPC calls used context.Background() with no deadline,
so the plugin scheduler's execution timeout could kill a job while
a large volume compact was still in progress. Use volume-size-scaled
timeouts matching the topology vacuum approach: 3 min/GB for compact,
1 min/GB for check, commit, and cleanup.

Fixes #8591

* scale scheduler execution timeout by volume size

The scheduler's per-job execution timeout (default 240s) would kill
vacuum jobs on large volumes before they finish. Three changes:

1. Vacuum detection now includes estimated_runtime_seconds in job
   proposals, computed as 5 min/GB of volume size.

2. The scheduler checks for estimated_runtime_seconds in job
   parameters and uses it as the execution timeout when larger than
   the default — a generic mechanism any handler can use.

3. Vacuum task gRPC calls now use the passed-in ctx as parent
   instead of context.Background(), so scheduler cancellation
   propagates to in-flight RPCs.

* extend job type runtime when proposals need more time

The JobTypeMaxRuntime (default 30 min) wraps both detection and
execution. Its context is the parent of all per-job execution
contexts, so even with per-job estimated_runtime_seconds, jobCtx
would cancel everything when it expires.

After detection, scan proposals for the maximum
estimated_runtime_seconds. If any proposal needs more time than
the remaining JobTypeMaxRuntime, create a new execution context
with enough headroom. This lets large vacuum jobs complete without
being killed by the job type deadline while still respecting the
configured limit for normal-sized jobs.

* log missing volume size metric, remove dead minimum runtime guard

Add a debug log in vacuumTimeout when t.volumeSize is 0 so
operators can investigate why metrics are missing for a volume.

Remove the unreachable estimatedRuntimeSeconds < 180 check in
buildVacuumProposal — volumeSizeGB always >= 1 (due to +1 floor),
so estimatedRuntimeSeconds is always >= 300.

* cap estimated runtime and fix status check context

- Cap maxEstimatedRuntime and per-job timeout overrides to 8 hours
  to prevent unbounded timeouts from bad metrics.
- Check execCtx.Err() instead of jobCtx.Err() for status reporting,
  since dispatch runs under execCtx which may have a longer deadline.
  A successful dispatch under execCtx was misreported as "timeout"
  when jobCtx had expired.
2026-03-10 13:48:42 -07:00

270 lines
8.5 KiB
Go

package vacuum
import (
"context"
"fmt"
"io"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/operation"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/worker/types"
"github.com/seaweedfs/seaweedfs/weed/worker/types/base"
"google.golang.org/grpc"
)
// VacuumTask implements the Task interface. It reclaims disk space on a
// single volume by driving the volume server's check/compact/commit/cleanup
// vacuum gRPC calls.
type VacuumTask struct {
	*base.BaseTask // embedded base: supplies ID, task type, logger, progress reporting

	server           string          // address of the volume server hosting the target volume
	volumeID         uint32          // target volume to vacuum
	collection       string          // collection the volume belongs to (logged, not used in RPCs)
	garbageThreshold float64         // minimum garbage ratio (0.0-1.0) required before vacuuming
	progress         float64         // last recorded progress percentage (returned by GetProgress)
	grpcDialOption   grpc.DialOption // dial option for all volume-server client connections
	volumeSize       uint64          // volume size in bytes; scales per-call timeouts in vacuumTimeout
}
// NewVacuumTask constructs a vacuum task for the given volume on the given
// server. The garbage threshold starts at the 30% default; Execute overrides
// it from the task parameters.
func NewVacuumTask(id string, server string, volumeID uint32, collection string, grpcDialOption grpc.DialOption) *VacuumTask {
	task := &VacuumTask{
		BaseTask:       base.NewBaseTask(id, types.TaskTypeVacuum),
		server:         server,
		volumeID:       volumeID,
		collection:     collection,
		grpcDialOption: grpcDialOption,
	}
	task.garbageThreshold = 0.3 // Default 30% threshold
	return task
}
// Execute implements the UnifiedTask interface. It runs the full vacuum
// workflow for the configured volume:
//  1. check the current garbage ratio against the configured threshold,
//  2. perform the compact/commit/cleanup vacuum operation,
//  3. verify the post-vacuum garbage ratio (best effort).
//
// The passed-in ctx is the parent of every per-step gRPC timeout, so
// scheduler cancellation propagates to in-flight RPCs. Sub-step errors are
// wrapped with %w so callers can inspect the cause via errors.Is/errors.As.
// A verification failure is logged but does not fail the task, because the
// vacuum operation itself already succeeded.
func (t *VacuumTask) Execute(ctx context.Context, params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}
	vacuumParams := params.GetVacuumParams()
	if vacuumParams == nil {
		return fmt.Errorf("vacuum parameters are required")
	}

	// Capture per-run inputs: the effective threshold and the volume size
	// that vacuumTimeout uses to scale per-call deadlines.
	t.garbageThreshold = vacuumParams.GarbageThreshold
	t.volumeSize = params.VolumeSize

	t.GetLogger().WithFields(map[string]interface{}{
		"volume_id":         t.volumeID,
		"server":            t.server,
		"collection":        t.collection,
		"garbage_threshold": t.garbageThreshold,
	}).Info("Starting vacuum task")

	// Step 1: Check volume status and garbage ratio
	t.ReportProgress(10.0)
	t.GetLogger().Info("Checking volume status")
	eligible, currentGarbageRatio, err := t.checkVacuumEligibility(ctx)
	if err != nil {
		return fmt.Errorf("failed to check vacuum eligibility: %w", err)
	}

	if !eligible {
		// Not enough garbage to justify a vacuum; report success and skip.
		t.GetLogger().WithFields(map[string]interface{}{
			"current_garbage_ratio": currentGarbageRatio,
			"required_threshold":    t.garbageThreshold,
		}).Info("Volume does not meet vacuum criteria, skipping")
		t.ReportProgress(100.0)
		return nil
	}

	// Step 2: Perform vacuum operation
	t.ReportProgress(50.0)
	t.GetLogger().WithFields(map[string]interface{}{
		"garbage_ratio": currentGarbageRatio,
		"threshold":     t.garbageThreshold,
	}).Info("Performing vacuum operation")
	if err := t.performVacuum(ctx); err != nil {
		return fmt.Errorf("failed to perform vacuum: %w", err)
	}

	// Step 3: Verify vacuum results (best effort — never fails the task)
	t.ReportProgress(90.0)
	t.GetLogger().Info("Verifying vacuum results")
	if err := t.verifyVacuumResults(ctx); err != nil {
		glog.Warningf("Vacuum verification failed: %v", err)
		// Don't fail the task - vacuum operation itself succeeded
	}

	t.ReportProgress(100.0)
	glog.Infof("Vacuum task completed successfully: volume %d from %s (garbage ratio was %.2f%%)",
		t.volumeID, t.server, currentGarbageRatio*100)
	return nil
}
// Validate implements the UnifiedTask interface. It checks that the task
// parameters are present, target the expected volume, name this task's
// server among the sources, and carry a garbage threshold in [0.0, 1.0].
func (t *VacuumTask) Validate(params *worker_pb.TaskParams) error {
	if params == nil {
		return fmt.Errorf("task parameters are required")
	}
	vacuumParams := params.GetVacuumParams()
	if vacuumParams == nil {
		return fmt.Errorf("vacuum parameters are required")
	}
	if params.VolumeId != t.volumeID {
		return fmt.Errorf("volume ID mismatch: expected %d, got %d", t.volumeID, params.VolumeId)
	}

	// Validate that at least one source matches our server
	matched := false
	for _, src := range params.Sources {
		if src.Node == t.server {
			matched = true
			break
		}
	}
	if !matched {
		return fmt.Errorf("no source matches expected server %s", t.server)
	}

	if threshold := vacuumParams.GarbageThreshold; threshold < 0 || threshold > 1.0 {
		return fmt.Errorf("invalid garbage threshold: %f (must be between 0.0 and 1.0)", threshold)
	}
	return nil
}
// EstimateTime implements the UnifiedTask interface. It returns a fixed
// rough estimate based on the simulated step durations; it does not scale
// with volume size.
func (t *VacuumTask) EstimateTime(params *worker_pb.TaskParams) time.Duration {
	const totalStepDuration = 14 * time.Second // Sum of all step durations
	return totalStepDuration
}
// GetProgress returns the most recently recorded progress percentage for
// this task.
func (t *VacuumTask) GetProgress() float64 {
	return t.progress
}
// vacuumTimeout returns a dynamic timeout scaled by volume size, matching
// the topology vacuum approach. base is the per-GB multiplier (e.g. 1 minute
// for check, 3 minutes for compact). The +1 floor guarantees at least one
// base unit even when the size metric is missing or under 1 GB.
func (t *VacuumTask) vacuumTimeout(base time.Duration) time.Duration {
	if t.volumeSize == 0 {
		// Missing size metric — surface it so operators can investigate.
		glog.V(1).Infof("volume %d has no size metric, using minimum timeout", t.volumeID)
	}
	const bytesPerGB = 1024 * 1024 * 1024
	gbCount := int64(t.volumeSize/bytesPerGB) + 1
	return time.Duration(gbCount) * base
}
// Helper methods for real vacuum operations

// checkVacuumEligibility asks the volume server for the volume's current
// garbage ratio and reports whether it meets the configured threshold.
// Returns (eligible, garbageRatio, error); on error the other values are
// meaningless.
func (t *VacuumTask) checkVacuumEligibility(ctx context.Context) (bool, float64, error) {
	var ratio float64
	err := operation.WithVolumeServerClient(false, pb.ServerAddress(t.server), t.grpcDialOption,
		func(client volume_server_pb.VolumeServerClient) error {
			// Size-scaled deadline (1 min/GB), child of the task ctx so
			// scheduler cancellation propagates.
			checkCtx, cancel := context.WithTimeout(ctx, t.vacuumTimeout(time.Minute))
			defer cancel()

			result, rpcErr := client.VacuumVolumeCheck(checkCtx, &volume_server_pb.VacuumVolumeCheckRequest{
				VolumeId: t.volumeID,
			})
			if rpcErr != nil {
				return fmt.Errorf("failed to check volume vacuum status: %v", rpcErr)
			}
			ratio = result.GarbageRatio
			return nil
		})
	if err != nil {
		return false, 0, err
	}

	meetsThreshold := ratio >= t.garbageThreshold
	glog.V(1).Infof("Volume %d garbage ratio: %.2f%%, threshold: %.2f%%, eligible: %v",
		t.volumeID, ratio*100, t.garbageThreshold*100, meetsThreshold)
	return meetsThreshold, ratio, nil
}
// performVacuum executes the actual vacuum operation as three sequential
// gRPC calls over one volume-server client connection: compact, commit,
// cleanup. Each call gets its own size-scaled deadline (see vacuumTimeout),
// parented on ctx so external cancellation propagates to in-flight RPCs.
// The order is load-bearing: commit must only run after the compact stream
// has drained to EOF, and cleanup only after a successful commit.
func (t *VacuumTask) performVacuum(ctx context.Context) error {
	return operation.WithVolumeServerClient(false, pb.ServerAddress(t.server), t.grpcDialOption,
		func(client volume_server_pb.VolumeServerClient) error {
			// Step 1: Compact the volume (3 min per GB, matching topology vacuum)
			t.GetLogger().Info("Compacting volume")
			compactCtx, compactCancel := context.WithTimeout(ctx, t.vacuumTimeout(3*time.Minute))
			defer compactCancel()
			stream, err := client.VacuumVolumeCompact(compactCtx, &volume_server_pb.VacuumVolumeCompactRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("vacuum compact failed: %v", err)
			}

			// Read compact progress until the server closes the stream (EOF).
			// Any other stream error aborts before commit, leaving the volume
			// uncommitted.
			for {
				resp, recvErr := stream.Recv()
				if recvErr != nil {
					if recvErr == io.EOF {
						break
					}
					return fmt.Errorf("vacuum compact stream error: %v", recvErr)
				}
				glog.V(2).Infof("Volume %d compact progress: %d bytes processed", t.volumeID, resp.ProcessedBytes)
			}

			// Step 2: Commit the vacuum (1 min per GB)
			t.GetLogger().Info("Committing vacuum operation")
			commitCtx, commitCancel := context.WithTimeout(ctx, t.vacuumTimeout(time.Minute))
			defer commitCancel()
			_, err = client.VacuumVolumeCommit(commitCtx, &volume_server_pb.VacuumVolumeCommitRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("vacuum commit failed: %v", err)
			}

			// Step 3: Cleanup old files (1 min per GB)
			t.GetLogger().Info("Cleaning up vacuum files")
			cleanupCtx, cleanupCancel := context.WithTimeout(ctx, t.vacuumTimeout(time.Minute))
			defer cleanupCancel()
			_, err = client.VacuumVolumeCleanup(cleanupCtx, &volume_server_pb.VacuumVolumeCleanupRequest{
				VolumeId: t.volumeID,
			})
			if err != nil {
				return fmt.Errorf("vacuum cleanup failed: %v", err)
			}

			glog.V(1).Infof("Volume %d vacuum operation completed successfully", t.volumeID)
			return nil
		})
}
// verifyVacuumResults re-checks the volume's garbage ratio after a vacuum
// and logs the post-vacuum value. It returns an error only if the check RPC
// itself fails; it does not judge whether the ratio actually improved.
func (t *VacuumTask) verifyVacuumResults(ctx context.Context) error {
	return operation.WithVolumeServerClient(false, pb.ServerAddress(t.server), t.grpcDialOption,
		func(client volume_server_pb.VolumeServerClient) error {
			// Same size-scaled deadline as the eligibility check (1 min/GB).
			verifyCtx, cancel := context.WithTimeout(ctx, t.vacuumTimeout(time.Minute))
			defer cancel()

			result, rpcErr := client.VacuumVolumeCheck(verifyCtx, &volume_server_pb.VacuumVolumeCheckRequest{
				VolumeId: t.volumeID,
			})
			if rpcErr != nil {
				return fmt.Errorf("failed to verify vacuum results: %v", rpcErr)
			}

			glog.V(1).Infof("Volume %d post-vacuum garbage ratio: %.2f%%",
				t.volumeID, result.GarbageRatio*100)
			return nil
		})
}