Add dynamic timeouts to plugin worker vacuum gRPC calls (#8593)

* add dynamic timeouts to plugin worker vacuum gRPC calls

All vacuum gRPC calls used context.Background() with no deadline,
so the plugin scheduler's execution timeout could kill a job while
a large volume was still being compacted. Use volume-size-scaled
timeouts matching the topology vacuum approach: 3 min/GB for compact,
1 min/GB for check, commit, and cleanup.

Fixes #8591

* scale scheduler execution timeout by volume size

The scheduler's per-job execution timeout (default 240s) would kill
vacuum jobs on large volumes before they finish. Three changes:

1. Vacuum detection now includes estimated_runtime_seconds in job
   proposals, computed as 5 min/GB of volume size.

2. The scheduler checks for estimated_runtime_seconds in job
   parameters and uses it as the execution timeout when larger than
   the default — a generic mechanism any handler can use.

3. Vacuum task gRPC calls now use the passed-in ctx as parent
   instead of context.Background(), so scheduler cancellation
   propagates to in-flight RPCs.
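Point 1 could be sketched as below; the function and constant names are illustrative assumptions, not the actual detection code. The +1 floor means even an empty volume gets a 300 s estimate, which matters later when a minimum-runtime guard is removed as dead code.

```go
package main

import "fmt"

// estimatedRuntimeSeconds sketches the 5 min/GB estimate attached to
// vacuum job proposals as estimated_runtime_seconds.
func estimatedRuntimeSeconds(volumeSizeBytes uint64) int64 {
	volumeSizeGB := int64(volumeSizeBytes>>30) + 1 // +1 floor
	return volumeSizeGB * 5 * 60                   // 5 min/GB, in seconds
}

func main() {
	fmt.Println(estimatedRuntimeSeconds(0))      // 300
	fmt.Println(estimatedRuntimeSeconds(20<<30)) // 6300
}
```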

* extend job type runtime when proposals need more time

The JobTypeMaxRuntime (default 30 min) wraps both detection and
execution. Its context is the parent of all per-job execution
contexts, so even with per-job estimated_runtime_seconds, jobCtx
would cancel everything when it expires.

After detection, scan proposals for the maximum
estimated_runtime_seconds. If any proposal needs more time than
the remaining JobTypeMaxRuntime, create a new execution context
with enough headroom. This lets large vacuum jobs complete without
being killed by the job type deadline while still respecting the
configured limit for normal-sized jobs.

* log missing volume size metric, remove dead minimum runtime guard

Add a debug log in vacuumTimeout when t.volumeSize is 0 so
operators can investigate why metrics are missing for a volume.

Remove the unreachable estimatedRuntimeSeconds < 180 check in
buildVacuumProposal — volumeSizeGB always >= 1 (due to +1 floor),
so estimatedRuntimeSeconds is always >= 300.

* cap estimated runtime and fix status check context

- Cap maxEstimatedRuntime and per-job timeout overrides to 8 hours
  to prevent unbounded timeouts from bad metrics.
- Check execCtx.Err() instead of jobCtx.Err() for status reporting,
  since dispatch runs under execCtx which may have a longer deadline.
  A successful dispatch under execCtx was misreported as "timeout"
  when jobCtx had expired.
Author: Chris Lu
Date: 2026-03-10 13:48:42 -07:00
Committed by: GitHub
Parent: 4c88fbfd5e
Commit: b17e2b411a
3 changed files with 96 additions and 17 deletions


@@ -32,6 +32,7 @@ const (
 	defaultClusterContextTimeout = 10 * time.Second
 	defaultWaitingBacklogFloor = 8
 	defaultWaitingBacklogMultiplier = 4
+	maxEstimatedRuntimeCap = 8 * time.Hour
 )

 type schedulerPolicy struct {
@@ -293,6 +294,26 @@ func (r *Plugin) runJobTypeIteration(jobType string, policy schedulerPolicy) boo
 	r.setSchedulerLoopState(jobType, "executing")

+	// Scan proposals for the maximum estimated_runtime_seconds so the
+	// execution phase gets enough time for large jobs (e.g. vacuum on
+	// big volumes). If any proposal needs more time than the remaining
+	// JobTypeMaxRuntime, extend the execution context accordingly.
+	var maxEstimatedRuntime time.Duration
+	for _, p := range filtered {
+		if p.Parameters != nil {
+			if est, ok := p.Parameters["estimated_runtime_seconds"]; ok {
+				if v := est.GetInt64Value(); v > 0 {
+					if d := time.Duration(v) * time.Second; d > maxEstimatedRuntime {
+						maxEstimatedRuntime = d
+					}
+				}
+			}
+		}
+	}
+	if maxEstimatedRuntime > maxEstimatedRuntimeCap {
+		maxEstimatedRuntime = maxEstimatedRuntimeCap
+	}
+
 	remaining = time.Until(start.Add(maxRuntime))
 	if remaining <= 0 {
 		r.appendActivity(JobActivity{
@@ -306,6 +327,17 @@ func (r *Plugin) runJobTypeIteration(jobType string, policy schedulerPolicy) boo
 		return detected
 	}

+	// If the longest estimated job exceeds the remaining JobTypeMaxRuntime,
+	// create a new execution context with enough headroom instead of using
+	// jobCtx which would cancel too early.
+	execCtx := jobCtx
+	execCancel := context.CancelFunc(func() {})
+	if maxEstimatedRuntime > 0 && maxEstimatedRuntime > remaining {
+		execCtx, execCancel = context.WithTimeout(context.Background(), maxEstimatedRuntime)
+		remaining = maxEstimatedRuntime
+	}
+	defer execCancel()
+
 	execPolicy := policy
 	if execPolicy.ExecutionTimeout <= 0 {
 		execPolicy.ExecutionTimeout = defaultScheduledExecutionTimeout
@@ -314,10 +346,10 @@ func (r *Plugin) runJobTypeIteration(jobType string, policy schedulerPolicy) boo
 		execPolicy.ExecutionTimeout = remaining
 	}

-	successCount, errorCount, canceledCount := r.dispatchScheduledProposals(jobCtx, jobType, filtered, clusterContext, execPolicy)
+	successCount, errorCount, canceledCount := r.dispatchScheduledProposals(execCtx, jobType, filtered, clusterContext, execPolicy)
 	status := "success"
-	if jobCtx.Err() != nil {
+	if execCtx.Err() != nil {
 		status = "timeout"
 	} else if errorCount > 0 || canceledCount > 0 {
 		status = "error"
@@ -937,7 +969,24 @@ func (r *Plugin) executeScheduledJobWithExecutor(
 	if parent == nil {
 		parent = context.Background()
 	}
-	execCtx, cancel := context.WithTimeout(parent, policy.ExecutionTimeout)
+
+	// Use the job's estimated runtime if provided and larger than the
+	// default execution timeout. This lets handlers like vacuum scale
+	// the timeout based on volume size so large volumes are not killed.
+	timeout := policy.ExecutionTimeout
+	if job.Parameters != nil {
+		if est, ok := job.Parameters["estimated_runtime_seconds"]; ok {
+			if v := est.GetInt64Value(); v > 0 {
+				estimated := time.Duration(v) * time.Second
+				if estimated > maxEstimatedRuntimeCap {
+					estimated = maxEstimatedRuntimeCap
+				}
+				if estimated > timeout {
+					timeout = estimated
+				}
+			}
+		}
+	}
+	execCtx, cancel := context.WithTimeout(parent, timeout)
 	_, err := r.executeJobWithExecutor(execCtx, executor, job, clusterContext, int32(attempt))
 	cancel()
 	if err == nil {