Files
seaweedFS/weed/admin/plugin/plugin_scheduler.go
Chris Lu 995dfc4d5d chore: remove ~50k lines of unreachable dead code (#8913)
* chore: remove unreachable dead code across the codebase

Remove ~50,000 lines of unreachable code identified by static analysis.

Major removals:
- weed/filer/redis_lua: entire unused Redis Lua filer store implementation
- weed/wdclient/net2, resource_pool: unused connection/resource pool packages
- weed/plugin/worker/lifecycle: unused lifecycle plugin worker
- weed/s3api: unused S3 policy templates, presigned URL IAM, streaming copy,
  multipart IAM, key rotation, and various SSE helper functions
- weed/mq/kafka: unused partition mapping, compression, schema, and protocol functions
- weed/mq/offset: unused SQL storage and migration code
- weed/worker: unused registry, task, and monitoring functions
- weed/query: unused SQL engine, parquet scanner, and type functions
- weed/shell: unused EC proportional rebalance functions
- weed/storage/erasure_coding/distribution: unused distribution analysis functions
- Individual unreachable functions removed from 150+ files across admin,
  credential, filer, iam, kms, mount, mq, operation, pb, s3api, server,
  shell, storage, topology, and util packages

* fix(s3): reset shared memory store in IAM test to prevent flaky failure

TestLoadIAMManagerFromConfig_EmptyConfigWithFallbackKey was flaky because
the MemoryStore credential backend is a singleton registered via init().
Earlier tests that create anonymous identities pollute the shared store,
causing LookupAnonymous() to unexpectedly return true.

Fix by calling Reset() on the memory store before the test runs.

* style: run gofmt on changed files

* fix: restore KMS functions used by integration tests

* fix(plugin): prevent panic on send to closed worker session channel

The Plugin.sendToWorker method could panic with "send on closed channel"
when a worker disconnected while a message was being sent. The race was
between streamSession.close() closing the outgoing channel and sendToWorker
writing to it concurrently.

Add a done channel to streamSession that is closed before the outgoing
channel, and check it in sendToWorker's select to safely detect closed
sessions without panicking.
2026-04-03 16:04:27 -07:00

1508 lines
41 KiB
Go

package plugin
import (
"context"
"errors"
"fmt"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
"google.golang.org/protobuf/types/known/timestamppb"
)
// errExecutorAtCapacity is returned by executeScheduledJobWithExecutor when
// the chosen worker reports it has no free capacity; the dispatcher treats it
// as a signal to back off and retry the same job.
var errExecutorAtCapacity = errors.New("executor is at capacity")

// errSchedulerShutdown is the cancellation cause attached to queued jobs that
// are drained without an expired context.
var errSchedulerShutdown = errors.New("scheduler shutdown")
// Timing defaults applied when the admin runtime configuration leaves a value
// unset or non-positive.
const (
	defaultSchedulerTick              = 5 * time.Second
	defaultScheduledDetectionInterval = 5 * time.Minute
	defaultScheduledDetectionTimeout  = 45 * time.Second
	defaultScheduledExecutionTimeout  = 90 * time.Second
	defaultScheduledJobTypeMaxRuntime = 30 * time.Minute
	defaultScheduledRetryBackoff      = 5 * time.Second
	defaultClusterContextTimeout      = 10 * time.Second

	// maxEstimatedRuntimeCap bounds any proposal-supplied
	// estimated_runtime_seconds value.
	maxEstimatedRuntimeCap = 8 * time.Hour
)

// Result-count and concurrency defaults and limits.
const (
	defaultScheduledMaxResults           int32 = 1000
	defaultScheduledExecutionConcurrency       = 1
	defaultScheduledPerWorkerConcurrency       = 1
	maxScheduledExecutionConcurrency           = 128
)

// Waiting-backlog tuning values; presumably consumed by
// waitingBacklogThreshold, which is defined outside this part of the file.
const (
	defaultWaitingBacklogFloor      = 8
	defaultWaitingBacklogMultiplier = 4
)
// schedulerPolicy is the resolved set of scheduling knobs for one job type.
// loadSchedulerPolicy builds it from the admin runtime configuration and
// fills in defaults for unset values.
type schedulerPolicy struct {
// DetectionInterval is how far ahead the next detection is scheduled each
// time a detection run is started.
DetectionInterval time.Duration
// DetectionTimeout bounds a single detection call.
DetectionTimeout time.Duration
// ExecutionTimeout bounds one execution attempt and the wait for executor
// capacity.
ExecutionTimeout time.Duration
// JobTypeMaxRuntime is the overall budget for one detection+execution pass.
JobTypeMaxRuntime time.Duration
// RetryBackoff is the pause between execution retry attempts.
RetryBackoff time.Duration
// MaxResults is the result limit passed to detection.
MaxResults int32
// ExecutionConcurrency is the number of dispatcher goroutines per pass.
ExecutionConcurrency int
// PerWorkerConcurrency is the starting per-worker execution limit, before
// worker capability and heartbeat limits are applied.
PerWorkerConcurrency int
// RetryLimit is the number of retries allowed after the first attempt.
RetryLimit int
// ExecutorReserveBackoff is the pause between attempts to reserve executor
// capacity.
ExecutorReserveBackoff time.Duration
}
// laneSchedulerLoop is the main scheduling goroutine for a single lane.
// Each lane runs independently with its own timing, lock scope, and wake channel.
//
// The loop runs scheduling passes back to back while they report work. When a
// pass reports no work it sleeps until the lane's idle interval elapses, the
// lane's earliest scheduled detection is due, the lane is woken through
// ls.wakeCh, or the plugin shuts down. It calls r.wg.Done on exit.
func (r *Plugin) laneSchedulerLoop(ls *schedulerLaneState) {
	// overdueRetryDelay is the minimum pause taken when a detection is already
	// overdue but the last pass did no work (for example the admin lock could
	// not be acquired, or a policy failed to load and left nextDetectionAt in
	// the past). Without it the loop would spin at full speed.
	const overdueRetryDelay = 100 * time.Millisecond

	defer r.wg.Done()
	for {
		select {
		case <-r.shutdownCh:
			return
		default:
		}

		hadJobs := r.runLaneSchedulerIteration(ls)
		r.recordLaneIterationComplete(ls, hadJobs)
		if hadJobs {
			continue
		}

		r.setLaneLoopState(ls, "", "sleeping")
		idleSleep := LaneIdleSleep(ls.lane)
		if nextRun := r.earliestLaneDetectionAt(ls.lane); !nextRun.IsZero() {
			if until := time.Until(nextRun); until <= 0 {
				idleSleep = 0
			} else if until < idleSleep {
				idleSleep = until
			}
		}
		if idleSleep <= 0 {
			idleSleep = overdueRetryDelay
		}

		// The timer is local to this iteration and never reused, so Stop is
		// enough; there is no need to drain its channel.
		timer := time.NewTimer(idleSleep)
		select {
		case <-r.shutdownCh:
			timer.Stop()
			return
		case <-ls.wakeCh:
			timer.Stop()
		case <-timer.C:
		}
	}
}
// runLaneSchedulerIteration performs a single scheduling pass for one lane
// and reports whether that pass produced work.
//
// It first expires stale jobs, then narrows the registry's detectable job
// types down to those mapped to ls.lane. With nothing to do the lane is
// marked idle and false is returned.
//
// Lanes that require a lock (e.g. LaneDefault) process their job types
// sequentially under one admin lock because their volume management
// operations share global state. Lanes that do not (e.g. LaneIceberg,
// LaneLifecycle) run each job type in its own goroutine so they do not block
// each other.
func (r *Plugin) runLaneSchedulerIteration(ls *schedulerLaneState) bool {
	r.expireStaleJobs(time.Now().UTC())

	var laneJobTypes []string
	for _, candidate := range r.registry.DetectableJobTypes() {
		if JobTypeLane(candidate) != ls.lane {
			continue
		}
		laneJobTypes = append(laneJobTypes, candidate)
	}

	switch {
	case len(laneJobTypes) == 0:
		r.setLaneLoopState(ls, "", "idle")
		return false
	case LaneRequiresLock(ls.lane):
		return r.runLaneSchedulerIterationLocked(ls, laneJobTypes)
	default:
		return r.runLaneSchedulerIterationConcurrent(ls, laneJobTypes)
	}
}
// dueJobType pairs a job type with its resolved scheduling policy.
// collectDueJobTypes produces one entry per job type whose detection is due.
type dueJobType struct {
// jobType is the job type name.
jobType string
// policy is the policy loaded for jobType by loadSchedulerPolicy.
policy schedulerPolicy
}
// collectDueJobTypes loads policies for all job types in the lane and
// returns those whose detection interval has elapsed.
//
// It also returns the set of job type names that must survive pruning. That
// set covers every detectable job type, not only this lane's: callers pass it
// to pruneSchedulerState and pruneDetectorLeases, which walk Plugin-wide maps
// shared by all lanes. A lane-only set would make each lane erase the other
// lanes' next-detection times, in-flight flags and detector leases.
//
// Job types whose policy fails to load are logged and skipped. Disabled job
// types have their scheduler state cleared. A job type that has never run is
// given a short initial delay before its first detection.
func (r *Plugin) collectDueJobTypes(ls *schedulerLaneState, jobTypes []string) (active map[string]struct{}, due []dueJobType) {
	active = make(map[string]struct{}, len(jobTypes))

	// Protect the state of job types owned by other lanes from pruning.
	for _, jobType := range r.registry.DetectableJobTypes() {
		active[jobType] = struct{}{}
	}

	for _, jobType := range jobTypes {
		active[jobType] = struct{}{}

		policy, enabled, err := r.loadSchedulerPolicy(jobType)
		if err != nil {
			glog.Warningf("Plugin scheduler [%s] failed to load policy for %s: %v", ls.lane, jobType, err)
			continue
		}
		if !enabled {
			r.clearSchedulerJobType(jobType)
			continue
		}

		initialDelay := time.Duration(0)
		if runInfo := r.snapshotSchedulerRun(jobType); runInfo.lastRunStartedAt.IsZero() {
			initialDelay = 5 * time.Second
		}
		if !r.markDetectionDue(jobType, policy.DetectionInterval, initialDelay) {
			continue
		}
		due = append(due, dueJobType{jobType: jobType, policy: policy})
	}
	return active, due
}
// runLaneSchedulerIterationLocked runs every due job type of the lane one
// after another while holding a single admin lock. It is used by the default
// lane, where volume management operations must be serialised.
//
// If the lock cannot be acquired the lane is marked idle and false is
// returned. Otherwise the result is true when at least one job type
// iteration reported work.
func (r *Plugin) runLaneSchedulerIterationLocked(ls *schedulerLaneState, jobTypes []string) bool {
	r.setLaneLoopState(ls, "", "waiting_for_lock")

	releaseLock, err := r.acquireAdminLock(fmt.Sprintf("plugin scheduler:%s", ls.lane))
	if err != nil {
		glog.Warningf("Plugin scheduler [%s] failed to acquire lock: %v", ls.lane, err)
		r.setLaneLoopState(ls, "", "idle")
		return false
	}
	if releaseLock != nil {
		defer releaseLock()
	}

	active, due := r.collectDueJobTypes(ls, jobTypes)

	ranAny := false
	for i := range due {
		// Call first so every due job type runs regardless of earlier results.
		ranAny = r.runJobTypeIteration(due[i].jobType, due[i].policy) || ranAny
	}

	r.pruneSchedulerState(active)
	r.pruneDetectorLeases(active)
	r.setLaneLoopState(ls, "", "idle")
	return ranAny
}
// runLaneSchedulerIterationConcurrent starts one goroutine per due job type
// and waits for all of them to finish. It is used by lanes (e.g. iceberg,
// lifecycle) whose job types do not share global state.
//
// It returns true when at least one job type iteration reported work.
func (r *Plugin) runLaneSchedulerIterationConcurrent(ls *schedulerLaneState, jobTypes []string) bool {
	active, due := r.collectDueJobTypes(ls, jobTypes)
	r.setLaneLoopState(ls, "", "busy")

	var (
		anyWork atomic.Bool
		pending sync.WaitGroup
	)
	pending.Add(len(due))
	for _, item := range due {
		item := item // per-iteration copy for the closure below
		go func() {
			defer pending.Done()
			if r.runJobTypeIteration(item.jobType, item.policy) {
				anyWork.Store(true)
			}
		}()
	}
	pending.Wait()

	r.pruneSchedulerState(active)
	r.pruneDetectorLeases(active)
	r.setLaneLoopState(ls, "", "idle")
	return anyWork.Load()
}
// wakeAllLanes signals the wake channel of every lane. Sends are
// non-blocking: a lane whose channel cannot accept the signal right now is
// skipped. It is a no-op on a nil receiver.
func (r *Plugin) wakeAllLanes() {
	if r == nil {
		return
	}
	wake := struct{}{}
	for _, laneState := range r.lanes {
		select {
		case laneState.wakeCh <- wake:
		default:
		}
	}
}
// wakeScheduler wakes every lane scheduler goroutine by delegating to
// wakeAllLanes. It takes no job type, so it cannot target a single lane.
// Kept for backward compatibility.
func (r *Plugin) wakeScheduler() {
r.wakeAllLanes()
}
// runJobTypeIteration performs one scheduled pass for a job type: it records
// the run start, optionally skips when the waiting backlog is too large,
// loads the cluster context, runs detection, de-duplicates the proposals and
// dispatches the survivors for execution.
//
// The whole pass is budgeted by policy.JobTypeMaxRuntime (falling back to
// defaultScheduledJobTypeMaxRuntime). The execution phase may get a fresh,
// longer context when a proposal's estimated_runtime_seconds exceeds what is
// left of that budget.
//
// It returns true when detection produced at least one proposal, even if all
// of them were later filtered out. It returns false when detection was
// skipped, failed, or timed out. The in-flight marker set here is cleared on
// return via finishDetection.
func (r *Plugin) runJobTypeIteration(jobType string, policy schedulerPolicy) bool {
	r.recordSchedulerRunStart(jobType)
	r.clearWaitingJobQueue(jobType)
	r.setSchedulerLoopStateForJobType(jobType, "detecting")
	r.markJobTypeInFlight(jobType)
	defer r.finishDetection(jobType)

	// logAt appends a scheduler activity entry stamped with the given time.
	logAt := func(at time.Time, stage, message string) {
		r.appendActivity(JobActivity{
			JobType:    jobType,
			Source:     "admin_scheduler",
			Message:    message,
			Stage:      stage,
			OccurredAt: timeToPtr(at),
		})
	}
	// logNow appends a scheduler activity entry stamped with the current time.
	logNow := func(stage, message string) {
		logAt(time.Now().UTC(), stage, message)
	}
	// finish records the final run status and passes the result through.
	finish := func(status string, result bool) bool {
		r.recordSchedulerRunComplete(jobType, status)
		return result
	}

	start := time.Now().UTC()
	logAt(start, "detecting", "scheduled detection started")

	if skip, waitingCount, waitingThreshold := r.shouldSkipDetectionForWaitingJobs(jobType, policy); skip {
		r.recordSchedulerDetectionSkip(jobType, fmt.Sprintf("waiting backlog %d reached threshold %d", waitingCount, waitingThreshold))
		logNow("skipped_waiting_backlog", fmt.Sprintf("scheduled detection skipped: waiting backlog %d reached threshold %d", waitingCount, waitingThreshold))
		return finish("skipped", false)
	}

	maxRuntime := policy.JobTypeMaxRuntime
	if maxRuntime <= 0 {
		maxRuntime = defaultScheduledJobTypeMaxRuntime
	}
	jobCtx, cancel := context.WithTimeout(context.Background(), maxRuntime)
	defer cancel()

	clusterContext, err := r.loadSchedulerClusterContext(jobCtx)
	if err != nil {
		r.recordSchedulerDetectionError(jobType, err)
		logNow("failed", fmt.Sprintf("scheduled detection aborted: %v", err))
		return finish("error", false)
	}

	// budgetEnd is when the job type's max runtime, counted from start, is used up.
	budgetEnd := start.Add(maxRuntime)
	remaining := time.Until(budgetEnd)
	if remaining <= 0 {
		logNow("timeout", "scheduled run timed out before detection")
		return finish("timeout", false)
	}

	detectionTimeout := policy.DetectionTimeout
	if detectionTimeout <= 0 {
		detectionTimeout = defaultScheduledDetectionTimeout
	}
	if detectionTimeout > remaining {
		detectionTimeout = remaining
	}

	detectCtx, cancelDetect := context.WithTimeout(jobCtx, detectionTimeout)
	proposals, err := r.RunDetection(detectCtx, jobType, clusterContext, policy.MaxResults)
	cancelDetect()
	if err != nil {
		r.recordSchedulerDetectionError(jobType, err)
		stage, status := "failed", "error"
		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
			stage, status = "timeout", "timeout"
		}
		logNow(stage, fmt.Sprintf("scheduled detection failed: %v", err))
		return finish(status, false)
	}

	logNow("detected", fmt.Sprintf("scheduled detection completed: %d proposal(s)", len(proposals)))
	r.recordSchedulerDetectionSuccess(jobType, len(proposals))
	detected := len(proposals) > 0

	filteredByActive, skippedActive := r.filterProposalsWithActiveJobs(jobType, proposals)
	if skippedActive > 0 {
		logNow("deduped_active_jobs", fmt.Sprintf("scheduled detection skipped %d proposal(s) due to active assigned/running jobs", skippedActive))
	}
	if len(filteredByActive) == 0 {
		return finish("success", detected)
	}

	filtered := r.filterScheduledProposals(filteredByActive)
	if dropped := len(filteredByActive) - len(filtered); dropped != 0 {
		logNow("deduped", fmt.Sprintf("scheduled detection deduped %d proposal(s) within this run", dropped))
	}
	if len(filtered) == 0 {
		return finish("success", detected)
	}

	r.setSchedulerLoopStateForJobType(jobType, "executing")

	// Find the largest estimated_runtime_seconds among the proposals so the
	// execution phase gets enough time for large jobs (e.g. vacuum on big
	// volumes).
	var maxEstimatedRuntime time.Duration
	for _, proposal := range filtered {
		if proposal.Parameters == nil {
			continue
		}
		estimate, ok := proposal.Parameters["estimated_runtime_seconds"]
		if !ok {
			continue
		}
		seconds := estimate.GetInt64Value()
		if seconds <= 0 {
			continue
		}
		if d := time.Duration(seconds) * time.Second; d > maxEstimatedRuntime {
			maxEstimatedRuntime = d
		}
	}
	if maxEstimatedRuntime > maxEstimatedRuntimeCap {
		maxEstimatedRuntime = maxEstimatedRuntimeCap
	}

	remaining = time.Until(budgetEnd)
	if remaining <= 0 {
		logNow("timeout", "scheduled execution skipped: job type max runtime reached")
		return finish("timeout", detected)
	}

	// When the longest estimate exceeds what is left of the budget, execute
	// under a fresh context with enough headroom instead of jobCtx, which
	// would cancel too early. remaining is positive here, so comparing
	// against it also implies the estimate is positive.
	execCtx, execCancel := jobCtx, context.CancelFunc(func() {})
	if maxEstimatedRuntime > remaining {
		execCtx, execCancel = context.WithTimeout(context.Background(), maxEstimatedRuntime)
		remaining = maxEstimatedRuntime
	}
	defer execCancel()

	execPolicy := policy
	if execPolicy.ExecutionTimeout <= 0 {
		execPolicy.ExecutionTimeout = defaultScheduledExecutionTimeout
	}
	if execPolicy.ExecutionTimeout > remaining {
		execPolicy.ExecutionTimeout = remaining
	}

	successCount, errorCount, canceledCount := r.dispatchScheduledProposals(execCtx, jobType, filtered, clusterContext, execPolicy)

	status := "success"
	switch {
	case execCtx.Err() != nil:
		status = "timeout"
	case errorCount > 0 || canceledCount > 0:
		status = "error"
	}
	logNow("executed", fmt.Sprintf("scheduled execution finished: success=%d error=%d canceled=%d", successCount, errorCount, canceledCount))
	return finish(status, detected)
}
// loadSchedulerPolicy resolves the scheduling policy for jobType from the
// persisted job type configuration, falling back to the descriptor's admin
// runtime defaults.
//
// It returns (policy, true, nil) when scheduling is enabled, (zero, false,
// nil) when no admin runtime configuration exists or it is disabled, and
// (zero, false, err) when the store cannot be read.
//
// Unset or non-positive values are replaced by the package defaults, and
// concurrency values are clamped: global concurrency to
// maxScheduledExecutionConcurrency, per-worker concurrency to the global
// value.
func (r *Plugin) loadSchedulerPolicy(jobType string) (schedulerPolicy, bool, error) {
	cfg, err := r.store.LoadJobTypeConfig(jobType)
	if err != nil {
		return schedulerPolicy{}, false, err
	}
	descriptor, err := r.store.LoadDescriptor(jobType)
	if err != nil {
		return schedulerPolicy{}, false, err
	}

	adminRuntime := deriveSchedulerAdminRuntime(cfg, descriptor)
	if adminRuntime == nil || !adminRuntime.Enabled {
		return schedulerPolicy{}, false, nil
	}

	// Plugin protocol currently has only detection timeout in admin settings,
	// so the execution timeout is derived as twice that value, never below
	// the default. Widen to time.Duration before doubling so the
	// multiplication cannot overflow the narrower proto integer type.
	execTimeout := 2 * time.Duration(adminRuntime.DetectionTimeoutSeconds) * time.Second
	if execTimeout < defaultScheduledExecutionTimeout {
		execTimeout = defaultScheduledExecutionTimeout
	}

	policy := schedulerPolicy{
		DetectionInterval:      durationFromSeconds(adminRuntime.DetectionIntervalSeconds, defaultScheduledDetectionInterval),
		DetectionTimeout:       durationFromSeconds(adminRuntime.DetectionTimeoutSeconds, defaultScheduledDetectionTimeout),
		ExecutionTimeout:       execTimeout,
		JobTypeMaxRuntime:      durationFromSeconds(adminRuntime.JobTypeMaxRuntimeSeconds, defaultScheduledJobTypeMaxRuntime),
		RetryBackoff:           durationFromSeconds(adminRuntime.RetryBackoffSeconds, defaultScheduledRetryBackoff),
		MaxResults:             adminRuntime.MaxJobsPerDetection,
		ExecutionConcurrency:   int(adminRuntime.GlobalExecutionConcurrency),
		PerWorkerConcurrency:   int(adminRuntime.PerWorkerExecutionConcurrency),
		RetryLimit:             int(adminRuntime.RetryLimit),
		ExecutorReserveBackoff: 200 * time.Millisecond,
	}

	// Detection cannot be scheduled more often than the scheduler ticks.
	if policy.DetectionInterval < r.schedulerTick {
		policy.DetectionInterval = r.schedulerTick
	}
	if policy.MaxResults <= 0 {
		policy.MaxResults = defaultScheduledMaxResults
	}
	if policy.ExecutionConcurrency <= 0 {
		policy.ExecutionConcurrency = defaultScheduledExecutionConcurrency
	}
	if policy.ExecutionConcurrency > maxScheduledExecutionConcurrency {
		policy.ExecutionConcurrency = maxScheduledExecutionConcurrency
	}
	if policy.PerWorkerConcurrency <= 0 {
		policy.PerWorkerConcurrency = defaultScheduledPerWorkerConcurrency
	}
	if policy.PerWorkerConcurrency > policy.ExecutionConcurrency {
		policy.PerWorkerConcurrency = policy.ExecutionConcurrency
	}
	if policy.RetryLimit < 0 {
		policy.RetryLimit = 0
	}
	if policy.JobTypeMaxRuntime <= 0 {
		policy.JobTypeMaxRuntime = defaultScheduledJobTypeMaxRuntime
	}
	return policy, true, nil
}
// ListSchedulerStates returns one SchedulerJobTypeState per known job type,
// combining the scheduler's timing state, the resolved policy (or the error
// from loading it), the last run information, and detector/executor
// availability from the worker registry.
//
// It returns an error only when the list of known job types cannot be
// obtained. Per-job-type policy errors are reported in the state's
// PolicyError field.
func (r *Plugin) ListSchedulerStates() ([]SchedulerJobTypeState, error) {
	knownJobTypes, err := r.ListKnownJobTypes()
	if err != nil {
		return nil, err
	}

	// Copy the scheduler maps under the lock so the rest of the work runs
	// without holding schedulerMu.
	r.schedulerMu.Lock()
	nextRuns := make(map[string]time.Time, len(r.nextDetectionAt))
	for name, at := range r.nextDetectionAt {
		nextRuns[name] = at
	}
	inFlight := make(map[string]bool, len(r.detectionInFlight))
	for name, flag := range r.detectionInFlight {
		inFlight[name] = flag
	}
	r.schedulerMu.Unlock()

	states := make([]SchedulerJobTypeState, 0, len(knownJobTypes))
	for _, info := range knownJobTypes {
		jobType := info.JobType
		state := SchedulerJobTypeState{
			JobType:           jobType,
			Lane:              string(JobTypeLane(jobType)),
			DetectionInFlight: inFlight[jobType],
		}
		if nextRun, ok := nextRuns[jobType]; ok && !nextRun.IsZero() {
			utc := nextRun.UTC()
			state.NextDetectionAt = &utc
		}

		policy, enabled, loadErr := r.loadSchedulerPolicy(jobType)
		switch {
		case loadErr != nil:
			state.PolicyError = loadErr.Error()
		case enabled:
			state.Enabled = true
			state.DetectionIntervalSeconds = secondsFromDuration(policy.DetectionInterval)
			state.DetectionTimeoutSeconds = secondsFromDuration(policy.DetectionTimeout)
			state.ExecutionTimeoutSeconds = secondsFromDuration(policy.ExecutionTimeout)
			state.JobTypeMaxRuntimeSeconds = secondsFromDuration(policy.JobTypeMaxRuntime)
			state.MaxJobsPerDetection = policy.MaxResults
			state.GlobalExecutionConcurrency = policy.ExecutionConcurrency
			state.PerWorkerExecutionConcurrency = policy.PerWorkerConcurrency
			state.RetryLimit = policy.RetryLimit
			state.RetryBackoffSeconds = secondsFromDuration(policy.RetryBackoff)
		}

		runInfo := r.snapshotSchedulerRun(jobType)
		if started := runInfo.lastRunStartedAt; !started.IsZero() {
			state.LastRunStartedAt = &started
		}
		if completed := runInfo.lastRunCompletedAt; !completed.IsZero() {
			state.LastRunCompletedAt = &completed
		}
		if runInfo.lastRunStatus != "" {
			state.LastRunStatus = runInfo.lastRunStatus
		}

		// Prefer the worker currently holding the detector lease; only when
		// there is no lease ask the registry to pick a detector.
		if leased := r.getDetectorLease(jobType); leased != "" {
			state.DetectorWorkerID = leased
			if worker, ok := r.registry.Get(leased); ok {
				if capability := worker.Capabilities[jobType]; capability != nil && capability.CanDetect {
					state.DetectorAvailable = true
				}
			}
		} else if detector, detectorErr := r.registry.PickDetector(jobType); detectorErr == nil && detector != nil {
			state.DetectorAvailable = true
			state.DetectorWorkerID = detector.WorkerID
		}

		if executors, executorErr := r.registry.ListExecutors(jobType); executorErr == nil {
			state.ExecutorWorkerCount = len(executors)
		}

		states = append(states, state)
	}
	return states, nil
}
// deriveSchedulerAdminRuntime picks the admin runtime configuration that
// applies to a job type. A persisted AdminRuntime wins and is returned as a
// copy; otherwise a config is built from the descriptor's
// AdminRuntimeDefaults. It returns nil when neither source is available.
func deriveSchedulerAdminRuntime(
	cfg *plugin_pb.PersistedJobTypeConfig,
	descriptor *plugin_pb.JobTypeDescriptor,
) *plugin_pb.AdminRuntimeConfig {
	switch {
	case cfg != nil && cfg.AdminRuntime != nil:
		persisted := *cfg.AdminRuntime
		return &persisted
	case descriptor != nil && descriptor.AdminRuntimeDefaults != nil:
		d := descriptor.AdminRuntimeDefaults
		return &plugin_pb.AdminRuntimeConfig{
			Enabled:                       d.Enabled,
			DetectionIntervalSeconds:      d.DetectionIntervalSeconds,
			DetectionTimeoutSeconds:       d.DetectionTimeoutSeconds,
			MaxJobsPerDetection:           d.MaxJobsPerDetection,
			GlobalExecutionConcurrency:    d.GlobalExecutionConcurrency,
			PerWorkerExecutionConcurrency: d.PerWorkerExecutionConcurrency,
			RetryLimit:                    d.RetryLimit,
			RetryBackoffSeconds:           d.RetryBackoffSeconds,
			JobTypeMaxRuntimeSeconds:      d.JobTypeMaxRuntimeSeconds,
		}
	default:
		return nil
	}
}
// markDetectionDue decides whether detection for jobType should start now
// and, if so, claims it.
//
// It returns false when detection is already in flight, when the scheduled
// time has not arrived yet, or when the job type has no scheduled time and
// initialDelay is positive (in which case the first run is scheduled
// initialDelay from now). Otherwise it schedules the next run interval from
// now, marks the job type in flight and returns true.
func (r *Plugin) markDetectionDue(jobType string, interval, initialDelay time.Duration) bool {
	now := time.Now().UTC()

	r.schedulerMu.Lock()
	defer r.schedulerMu.Unlock()

	nextRun, scheduled := r.nextDetectionAt[jobType]
	switch {
	case r.detectionInFlight[jobType]:
		return false
	case scheduled && now.Before(nextRun):
		return false
	case !scheduled && initialDelay > 0:
		r.nextDetectionAt[jobType] = now.Add(initialDelay)
		return false
	}

	r.nextDetectionAt[jobType] = now.Add(interval)
	r.detectionInFlight[jobType] = true
	return true
}
// earliestLaneDetectionAt reports the soonest scheduled detection time among
// the job types mapped to lane. Zero-valued entries are ignored. The zero
// time is returned when the receiver is nil or nothing is scheduled.
func (r *Plugin) earliestLaneDetectionAt(lane SchedulerLane) time.Time {
	var soonest time.Time
	if r == nil {
		return soonest
	}

	r.schedulerMu.Lock()
	defer r.schedulerMu.Unlock()

	for jobType, at := range r.nextDetectionAt {
		if JobTypeLane(jobType) != lane || at.IsZero() {
			continue
		}
		if soonest.IsZero() || at.Before(soonest) {
			soonest = at
		}
	}
	return soonest
}
// earliestNextDetectionAt reports the soonest scheduled detection time
// across all job types regardless of lane. Kept for backward compatibility
// and the global scheduler status API. The zero time is returned when the
// receiver is nil or nothing is scheduled.
func (r *Plugin) earliestNextDetectionAt() time.Time {
	var soonest time.Time
	if r == nil {
		return soonest
	}

	r.schedulerMu.Lock()
	defer r.schedulerMu.Unlock()

	for _, at := range r.nextDetectionAt {
		if at.IsZero() {
			continue
		}
		if soonest.IsZero() || at.Before(soonest) {
			soonest = at
		}
	}
	return soonest
}
// markJobTypeInFlight flags jobType as having a detection run in progress.
func (r *Plugin) markJobTypeInFlight(jobType string) {
	r.schedulerMu.Lock()
	defer r.schedulerMu.Unlock()
	r.detectionInFlight[jobType] = true
}
// finishDetection removes the in-flight flag for jobType.
func (r *Plugin) finishDetection(jobType string) {
	r.schedulerMu.Lock()
	defer r.schedulerMu.Unlock()
	delete(r.detectionInFlight, jobType)
}
// pruneSchedulerState drops the next-detection time and in-flight flag of
// every job type that has a scheduled time but is absent from
// activeJobTypes.
func (r *Plugin) pruneSchedulerState(activeJobTypes map[string]struct{}) {
	r.schedulerMu.Lock()
	defer r.schedulerMu.Unlock()

	for jobType := range r.nextDetectionAt {
		if _, keep := activeJobTypes[jobType]; keep {
			continue
		}
		delete(r.nextDetectionAt, jobType)
		delete(r.detectionInFlight, jobType)
	}
}
// clearSchedulerJobType forgets all scheduling state for jobType: its
// next-detection time, its in-flight flag and, after the scheduler lock has
// been released, its detector lease.
func (r *Plugin) clearSchedulerJobType(jobType string) {
	func() {
		r.schedulerMu.Lock()
		defer r.schedulerMu.Unlock()
		delete(r.nextDetectionAt, jobType)
		delete(r.detectionInFlight, jobType)
	}()
	r.clearDetectorLease(jobType, "")
}
// pruneDetectorLeases removes the detector lease of every job type that is
// absent from activeJobTypes.
func (r *Plugin) pruneDetectorLeases(activeJobTypes map[string]struct{}) {
	r.detectorLeaseMu.Lock()
	defer r.detectorLeaseMu.Unlock()

	for jobType := range r.detectorLeases {
		if _, keep := activeJobTypes[jobType]; keep {
			continue
		}
		delete(r.detectorLeases, jobType)
	}
}
// loadSchedulerClusterContext asks the configured cluster context provider
// for the current cluster context, bounding the call by
// defaultClusterContextTimeout on top of ctx (a nil ctx is treated as
// context.Background()).
//
// It returns an error when no provider is configured, when the provider
// fails, or when the provider returns a nil context.
func (r *Plugin) loadSchedulerClusterContext(ctx context.Context) (*plugin_pb.ClusterContext, error) {
	provider := r.clusterContextProvider
	if provider == nil {
		return nil, fmt.Errorf("cluster context provider is not configured")
	}

	parent := ctx
	if parent == nil {
		parent = context.Background()
	}
	timedCtx, cancel := context.WithTimeout(parent, defaultClusterContextTimeout)
	defer cancel()

	snapshot, err := provider(timedCtx)
	switch {
	case err != nil:
		return nil, err
	case snapshot == nil:
		return nil, fmt.Errorf("cluster context provider returned nil")
	}
	return snapshot, nil
}
// dispatchScheduledProposals turns proposals into job specs, tracks each as
// queued, and executes them with up to policy.ExecutionConcurrency worker
// goroutines. It returns the number of jobs that succeeded, failed, and were
// canceled.
//
// For every job a worker reserves an executor, runs the job, and releases
// the reservation. errExecutorAtCapacity makes the worker re-track the job
// as queued, wait policy.ExecutorReserveBackoff and try again. Jobs are
// canceled when ctx ends.
//
// Every job that was tracked as queued but abandoned is passed to
// cancelQueuedJob and counted as canceled. That includes jobs queued before
// a shutdown seen during enqueueing, and a job a worker is holding when it
// stops because of shutdown or an interrupted back-off, so none is left
// tracked as queued.
//
// A nil ctx is treated as context.Background().
func (r *Plugin) dispatchScheduledProposals(
	ctx context.Context,
	jobType string,
	proposals []*plugin_pb.JobProposal,
	clusterContext *plugin_pb.ClusterContext,
	policy schedulerPolicy,
) (int, int, int) {
	if ctx == nil {
		ctx = context.Background()
	}

	// The buffer holds every proposal, so the sends below never block.
	jobQueue := make(chan *plugin_pb.JobSpec, len(proposals))
	for index, proposal := range proposals {
		job := buildScheduledJobSpec(jobType, proposal, index)
		r.trackExecutionQueued(job)
		jobQueue <- job

		select {
		case <-r.shutdownCh:
			// Shutting down before any worker started: cancel everything
			// tracked so far instead of leaving it queued.
			close(jobQueue)
			canceled := 0
			for queued := range jobQueue {
				r.cancelQueuedJob(queued, errSchedulerShutdown)
				canceled++
			}
			return 0, 0, canceled
		default:
		}
	}
	close(jobQueue)

	var (
		wg            sync.WaitGroup
		statsMu       sync.Mutex
		successCount  int
		errorCount    int
		canceledCount int
	)
	// bump increments one of the counters above under statsMu.
	bump := func(counter *int) {
		statsMu.Lock()
		*counter++
		statsMu.Unlock()
	}
	// cancelJob marks a tracked job as canceled and counts it.
	cancelJob := func(job *plugin_pb.JobSpec, cause error) {
		r.cancelQueuedJob(job, cause)
		bump(&canceledCount)
	}
	// stopCause is the reason given to jobs abandoned when a worker stops:
	// the context error if there is one, otherwise scheduler shutdown.
	stopCause := func() error {
		if err := ctx.Err(); err != nil {
			return err
		}
		return errSchedulerShutdown
	}

	workerCount := policy.ExecutionConcurrency
	if workerCount < 1 {
		workerCount = 1
	}
	// More workers than jobs would only start goroutines that exit at once.
	if workerCount > len(proposals) {
		workerCount = len(proposals)
	}

	for i := 0; i < workerCount; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()

		jobLoop:
			for job := range jobQueue {
				select {
				case <-r.shutdownCh:
					cancelJob(job, stopCause())
					return
				default:
				}

				if ctx.Err() != nil {
					cancelJob(job, ctx.Err())
					continue
				}

				// Retry loop for a single job: repeats only while the
				// executor reports it is at capacity.
				for {
					select {
					case <-r.shutdownCh:
						cancelJob(job, stopCause())
						return
					default:
					}

					if ctx.Err() != nil {
						cancelJob(job, ctx.Err())
						continue jobLoop
					}

					executor, release, reserveErr := r.reserveScheduledExecutor(ctx, jobType, policy)
					if reserveErr != nil {
						if ctx.Err() != nil {
							cancelJob(job, ctx.Err())
							continue jobLoop
						}
						bump(&errorCount)
						r.appendActivity(JobActivity{
							JobType:    jobType,
							Source:     "admin_scheduler",
							Message:    fmt.Sprintf("scheduled execution reservation failed: %v", reserveErr),
							Stage:      "failed",
							OccurredAt: timeToPtr(time.Now().UTC()),
						})
						break
					}

					err := r.executeScheduledJobWithExecutor(ctx, executor, job, clusterContext, policy)
					release()

					if errors.Is(err, errExecutorAtCapacity) {
						r.trackExecutionQueued(job)
						if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.ExecutorReserveBackoff) {
							// The job was just re-queued; do not drop it silently.
							cancelJob(job, stopCause())
							return
						}
						continue
					}

					if err != nil {
						if ctx.Err() != nil || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
							cancelJob(job, err)
							continue jobLoop
						}
						bump(&errorCount)
						r.appendActivity(JobActivity{
							JobID:      job.JobId,
							JobType:    job.JobType,
							Source:     "admin_scheduler",
							Message:    fmt.Sprintf("scheduled execution failed: %v", err),
							Stage:      "failed",
							OccurredAt: timeToPtr(time.Now().UTC()),
						})
						break
					}

					bump(&successCount)
					break
				}
			}
		}()
	}

	wg.Wait()

	// Workers may have stopped early; cancel whatever they never picked up.
	drainErr := stopCause()
	for job := range jobQueue {
		cancelJob(job, drainErr)
	}

	return successCount, errorCount, canceledCount
}
// reserveScheduledExecutor polls the registry until it can reserve an
// execution slot on some executor for jobType, then returns that executor
// together with a function that releases the reservation.
//
// The wait is bounded by policy.ExecutionTimeout (10 minutes when that is
// not positive) and by ctx's deadline, whichever is earlier. Between polls
// it waits policy.ExecutorReserveBackoff. It fails when the plugin shuts
// down, ctx ends, or the deadline passes. A nil ctx is treated as
// context.Background().
func (r *Plugin) reserveScheduledExecutor(
	ctx context.Context,
	jobType string,
	policy schedulerPolicy,
) (*WorkerSession, func(), error) {
	if ctx == nil {
		ctx = context.Background()
	}

	maxWait := policy.ExecutionTimeout
	if maxWait <= 0 {
		maxWait = 10 * time.Minute // Default cap
	}
	deadline := time.Now().Add(maxWait)
	if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) {
		deadline = ctxDeadline
	}

	// interrupted explains why a back-off wait ended early.
	interrupted := func() error {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		return fmt.Errorf("plugin is shutting down")
	}

	for {
		select {
		case <-r.shutdownCh:
			return nil, nil, fmt.Errorf("plugin is shutting down")
		default:
		}
		if ctx.Err() != nil {
			return nil, nil, ctx.Err()
		}
		if time.Now().After(deadline) {
			return nil, nil, fmt.Errorf("timed out waiting for executor capacity for %s", jobType)
		}

		// A listing error is handled like "no capacity": back off and poll again.
		if executors, err := r.registry.ListExecutors(jobType); err == nil {
			for _, executor := range executors {
				if release, ok := r.tryReserveExecutorCapacity(executor, jobType, policy); ok {
					return executor, release, nil
				}
			}
		}

		if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.ExecutorReserveBackoff) {
			return nil, nil, interrupted()
		}
	}
}
// tryReserveExecutorCapacity reserves an execution slot on executor in the
// lane that owns jobType. See tryReserveExecutorCapacityForLane for the
// meaning of the results.
func (r *Plugin) tryReserveExecutorCapacity(
	executor *WorkerSession,
	jobType string,
	policy schedulerPolicy,
) (func(), bool) {
	lane := JobTypeLane(jobType)
	return r.tryReserveExecutorCapacityForLane(executor, jobType, policy, lane)
}
// tryReserveExecutorCapacityForLane reserves an execution slot on the
// per-lane reservation pool so that lanes cannot starve each other.
//
// The reservation succeeds when the slots the worker reports as used in its
// heartbeat plus the slots already reserved are below the worker's execution
// limit. On success it returns a release function and true. It returns
// (nil, false) when executor is nil, has a blank worker ID, has a
// non-positive limit, or is full. If the lane has no state, the plugin-wide
// reservation map is used instead.
func (r *Plugin) tryReserveExecutorCapacityForLane(
	executor *WorkerSession,
	jobType string,
	policy schedulerPolicy,
	lane SchedulerLane,
) (func(), bool) {
	if executor == nil {
		return nil, false
	}
	workerID := strings.TrimSpace(executor.WorkerID)
	if workerID == "" {
		return nil, false
	}

	limit := schedulerWorkerExecutionLimit(executor, jobType, policy)
	if limit <= 0 {
		return nil, false
	}

	inUse := 0
	if hb := executor.Heartbeat; hb != nil && hb.ExecutionSlotsUsed > 0 {
		inUse = int(hb.ExecutionSlotsUsed)
	}

	ls := r.lanes[lane]
	if ls == nil {
		// Fallback to global reservations if lane state is missing.
		r.schedulerExecMu.Lock()
		defer r.schedulerExecMu.Unlock()

		reserved := r.schedulerExecReservations[workerID]
		if inUse+reserved >= limit {
			return nil, false
		}
		r.schedulerExecReservations[workerID] = reserved + 1
		return func() { r.releaseExecutorCapacity(workerID) }, true
	}

	ls.execMu.Lock()
	defer ls.execMu.Unlock()

	reserved := ls.execRes[workerID]
	if inUse+reserved >= limit {
		return nil, false
	}
	ls.execRes[workerID] = reserved + 1
	return func() { r.releaseExecutorCapacityForLane(workerID, lane) }, true
}
// releaseExecutorCapacityForLane gives back one reservation held for
// workerID in the lane's pool, removing the map entry when the count would
// drop to zero. A blank workerID is ignored. If the lane has no state the
// release goes to the plugin-wide pool.
func (r *Plugin) releaseExecutorCapacityForLane(workerID string, lane SchedulerLane) {
	id := strings.TrimSpace(workerID)
	if id == "" {
		return
	}

	ls := r.lanes[lane]
	if ls == nil {
		r.releaseExecutorCapacity(id)
		return
	}

	ls.execMu.Lock()
	defer ls.execMu.Unlock()

	if held := ls.execRes[id]; held > 1 {
		ls.execRes[id] = held - 1
	} else {
		delete(ls.execRes, id)
	}
}
// releaseExecutorCapacity gives back one reservation held for workerID in
// the plugin-wide pool, removing the map entry when the count would drop to
// zero. A blank workerID is ignored.
func (r *Plugin) releaseExecutorCapacity(workerID string) {
	id := strings.TrimSpace(workerID)
	if id == "" {
		return
	}

	r.schedulerExecMu.Lock()
	defer r.schedulerExecMu.Unlock()

	if held := r.schedulerExecReservations[id]; held > 1 {
		r.schedulerExecReservations[id] = held - 1
	} else {
		delete(r.schedulerExecReservations, id)
	}
}
// schedulerWorkerExecutionLimit computes how many jobs of jobType may run
// at once on executor. It starts from policy.PerWorkerConcurrency (or
// defaultScheduledPerWorkerConcurrency when that is not positive) and lowers
// it to the worker's declared MaxExecutionConcurrency for the job type and
// to the heartbeat's ExecutionSlotsTotal, when those are positive. The
// result is never negative. executor must not be nil.
func schedulerWorkerExecutionLimit(executor *WorkerSession, jobType string, policy schedulerPolicy) int {
	limit := policy.PerWorkerConcurrency
	if limit <= 0 {
		limit = defaultScheduledPerWorkerConcurrency
	}

	// lowerTo reduces limit to ceiling when ceiling is smaller.
	lowerTo := func(ceiling int) {
		if ceiling < limit {
			limit = ceiling
		}
	}

	if capability := executor.Capabilities[jobType]; capability != nil && capability.MaxExecutionConcurrency > 0 {
		lowerTo(int(capability.MaxExecutionConcurrency))
	}
	if hb := executor.Heartbeat; hb != nil && hb.ExecutionSlotsTotal > 0 {
		lowerTo(int(hb.ExecutionSlotsTotal))
	}

	if limit < 0 {
		return 0
	}
	return limit
}
// executeScheduledJobWithExecutor runs job on executor, retrying up to
// policy.RetryLimit times after the first attempt with policy.RetryBackoff
// between attempts.
//
// Each attempt gets its own timeout: policy.ExecutionTimeout, or the job's
// estimated_runtime_seconds parameter (capped at maxEstimatedRuntimeCap)
// when that is larger.
//
// It returns nil on success, errExecutorAtCapacity as soon as the executor
// reports it is at capacity (without retrying), the context error when ctx
// ends, a shutdown error when the plugin is stopping, and otherwise the
// error of the last attempt.
func (r *Plugin) executeScheduledJobWithExecutor(
	ctx context.Context,
	executor *WorkerSession,
	job *plugin_pb.JobSpec,
	clusterContext *plugin_pb.ClusterContext,
	policy schedulerPolicy,
) error {
	maxAttempts := policy.RetryLimit + 1
	if maxAttempts < 1 {
		maxAttempts = 1
	}

	// parent is ctx with nil replaced by Background. Background never
	// reports an error, so checking parent.Err() matches a nil-guarded check
	// on ctx.
	parent := ctx
	if parent == nil {
		parent = context.Background()
	}

	// attemptTimeout picks the timeout for one attempt. Using the job's
	// estimate lets handlers like vacuum scale the timeout with volume size
	// so large volumes are not killed.
	attemptTimeout := func() time.Duration {
		timeout := policy.ExecutionTimeout
		if job.Parameters == nil {
			return timeout
		}
		estimate, ok := job.Parameters["estimated_runtime_seconds"]
		if !ok {
			return timeout
		}
		seconds := estimate.GetInt64Value()
		if seconds <= 0 {
			return timeout
		}
		estimated := time.Duration(seconds) * time.Second
		if estimated > maxEstimatedRuntimeCap {
			estimated = maxEstimatedRuntimeCap
		}
		if estimated > timeout {
			return estimated
		}
		return timeout
	}

	var lastErr error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		select {
		case <-r.shutdownCh:
			return fmt.Errorf("plugin is shutting down")
		default:
		}
		if err := parent.Err(); err != nil {
			return err
		}

		execCtx, cancel := context.WithTimeout(parent, attemptTimeout())
		_, err := r.executeJobWithExecutor(execCtx, executor, job, clusterContext, int32(attempt))
		cancel()

		if err == nil {
			return nil
		}
		if isExecutorAtCapacityError(err) {
			return errExecutorAtCapacity
		}
		lastErr = err

		if attempt == maxAttempts {
			break
		}
		r.appendActivity(JobActivity{
			JobID:      job.JobId,
			JobType:    job.JobType,
			Source:     "admin_scheduler",
			Message:    fmt.Sprintf("retrying job attempt %d/%d after error: %v", attempt, maxAttempts, err),
			Stage:      "retry",
			OccurredAt: timeToPtr(time.Now().UTC()),
		})
		if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.RetryBackoff) {
			if err := parent.Err(); err != nil {
				return err
			}
			return fmt.Errorf("plugin is shutting down")
		}
	}

	if lastErr == nil {
		lastErr = fmt.Errorf("execution failed without an explicit error")
	}
	return lastErr
}
// shouldSkipDetectionForWaitingJobs reports whether the number of waiting
// tracked jobs of jobType has reached the backlog threshold derived from
// policy. It returns (skip, waitingCount, threshold); skip is always false
// when the threshold is not positive.
func (r *Plugin) shouldSkipDetectionForWaitingJobs(jobType string, policy schedulerPolicy) (bool, int, int) {
	backlog := r.countWaitingTrackedJobs(jobType)
	limit := waitingBacklogThreshold(policy)
	skip := limit > 0 && backlog >= limit
	return skip, backlog, limit
}
// countWaitingTrackedJobs returns the number of entries in r.jobs whose
// trimmed JobType equals the trimmed jobType and whose State satisfies
// isWaitingTrackedJobState. A blank jobType yields 0. The scan runs under
// r.jobsMu's read lock.
func (r *Plugin) countWaitingTrackedJobs(jobType string) int {
	wanted := strings.TrimSpace(jobType)
	if wanted == "" {
		return 0
	}

	r.jobsMu.RLock()
	defer r.jobsMu.RUnlock()

	count := 0
	for _, tracked := range r.jobs {
		if tracked != nil &&
			strings.TrimSpace(tracked.JobType) == wanted &&
			isWaitingTrackedJobState(tracked.State) {
			count++
		}
	}
	return count
}
// clearWaitingJobQueue cancels every tracked job of jobType that is still in a
// waiting state and returns how many distinct job IDs were canceled. Jobs with
// a blank ID are skipped, and a blank jobType yields 0.
func (r *Plugin) clearWaitingJobQueue(jobType string) int {
	wanted := strings.TrimSpace(jobType)
	if wanted == "" {
		return 0
	}

	// Snapshot the matching IDs under the read lock; markJobCanceled is
	// invoked only after the lock has been released.
	pending := make([]string, 0)
	collected := make(map[string]struct{})
	r.jobsMu.RLock()
	for _, tracked := range r.jobs {
		if tracked == nil ||
			strings.TrimSpace(tracked.JobType) != wanted ||
			!isWaitingTrackedJobState(tracked.State) {
			continue
		}
		id := strings.TrimSpace(tracked.JobID)
		if id == "" {
			continue
		}
		if _, dup := collected[id]; dup {
			continue
		}
		collected[id] = struct{}{}
		pending = append(pending, id)
	}
	r.jobsMu.RUnlock()

	if len(pending) == 0 {
		return 0
	}

	reason := fmt.Sprintf("cleared queued job before %s run", wanted)
	for _, id := range pending {
		spec := &plugin_pb.JobSpec{JobId: id, JobType: wanted}
		r.markJobCanceled(spec, reason)
	}
	return len(pending)
}
// waitingBacklogThreshold computes the waiting-job backlog threshold for
// policy: ExecutionConcurrency (or defaultScheduledExecutionConcurrency when
// not positive) times defaultWaitingBacklogMultiplier, raised to at least
// defaultWaitingBacklogFloor, and finally capped at policy.MaxResults when
// that is positive.
func waitingBacklogThreshold(policy schedulerPolicy) int {
	workers := policy.ExecutionConcurrency
	if workers <= 0 {
		workers = defaultScheduledExecutionConcurrency
	}

	limit := workers * defaultWaitingBacklogMultiplier
	if limit < defaultWaitingBacklogFloor {
		limit = defaultWaitingBacklogFloor
	}

	// The MaxResults cap is applied last, so it may pull the value below the floor.
	if policy.MaxResults > 0 {
		if ceiling := int(policy.MaxResults); limit > ceiling {
			limit = ceiling
		}
	}
	return limit
}
// isExecutorAtCapacityError reports whether err signals an executor that is at
// capacity: either it wraps errExecutorAtCapacity, or its message contains
// "executor is at capacity" (case-insensitive). A nil err is never a capacity
// error.
func isExecutorAtCapacityError(err error) bool {
	switch {
	case err == nil:
		return false
	case errors.Is(err, errExecutorAtCapacity):
		return true
	default:
		// Fall back to a message match for errors that do not wrap the sentinel.
		message := strings.ToLower(err.Error())
		return strings.Contains(message, "executor is at capacity")
	}
}
// buildScheduledJobSpec creates the JobSpec for the index-th proposal of a
// scheduled run.
//
// The job ID has the form "<jobType>-scheduled-<unix nanos>-<index>" and is
// always derived from the jobType argument. Without a proposal the spec gets
// normal priority, empty parameter/label maps, and the current time as both
// CreatedAt and ScheduledAt. A non-nil proposal supplies summary, detail,
// dedupe key and cloned parameters, and overrides job type, priority, labels
// and ScheduledAt (from NotBefore) whenever it sets them.
func buildScheduledJobSpec(jobType string, proposal *plugin_pb.JobProposal, index int) *plugin_pb.JobSpec {
	stamp := timestamppb.Now()
	spec := &plugin_pb.JobSpec{
		JobId:       fmt.Sprintf("%s-scheduled-%d-%d", jobType, stamp.AsTime().UnixNano(), index),
		JobType:     jobType,
		Priority:    plugin_pb.JobPriority_JOB_PRIORITY_NORMAL,
		Parameters:  map[string]*plugin_pb.ConfigValue{},
		Labels:      map[string]string{},
		CreatedAt:   stamp,
		ScheduledAt: stamp,
	}
	if proposal == nil {
		return spec
	}

	// Fields copied from the proposal unconditionally.
	spec.Summary, spec.Detail, spec.DedupeKey = proposal.Summary, proposal.Detail, proposal.DedupeKey
	spec.Parameters = CloneConfigValueMap(proposal.Parameters)

	// Fields the proposal overrides only when it actually sets them.
	if proposal.JobType != "" {
		spec.JobType = proposal.JobType
	}
	if proposal.Priority != plugin_pb.JobPriority_JOB_PRIORITY_UNSPECIFIED {
		spec.Priority = proposal.Priority
	}
	if proposal.Labels != nil {
		labels := make(map[string]string, len(proposal.Labels))
		for name, value := range proposal.Labels {
			labels[name] = value
		}
		spec.Labels = labels
	}
	if proposal.NotBefore != nil {
		spec.ScheduledAt = proposal.NotBefore
	}
	return spec
}
// durationFromSeconds converts a whole number of seconds into a time.Duration.
// Non-positive values yield defaultValue instead.
func durationFromSeconds(seconds int32, defaultValue time.Duration) time.Duration {
	if seconds > 0 {
		return time.Second * time.Duration(seconds)
	}
	return defaultValue
}
// secondsFromDuration converts duration to whole seconds, truncating any
// fractional second. Non-positive durations yield 0. Durations longer than an
// int32 can represent in seconds (about 68 years) saturate at the int32
// maximum instead of wrapping around to an unrelated, possibly negative value.
func secondsFromDuration(duration time.Duration) int32 {
	const maxInt32Seconds = 1<<31 - 1 // largest value representable as int32

	if duration <= 0 {
		return 0
	}
	seconds := duration / time.Second
	if seconds > maxInt32Seconds {
		return maxInt32Seconds
	}
	return int32(seconds)
}
// waitForShutdownOrTimerWithContext blocks for duration unless interrupted.
// It returns true when the full duration elapsed (or duration is not
// positive, in which case it returns immediately) and false when shutdown
// became readable or ctx was done first. A nil ctx never interrupts the wait.
func waitForShutdownOrTimerWithContext(shutdown <-chan struct{}, ctx context.Context, duration time.Duration) bool {
	if duration <= 0 {
		return true
	}

	// A nil channel blocks forever in select, which matches what a
	// background context's Done channel does for a missing ctx.
	var canceled <-chan struct{}
	if ctx != nil {
		canceled = ctx.Done()
	}

	wait := time.NewTimer(duration)
	defer wait.Stop()

	select {
	case <-wait.C:
		return true
	case <-canceled:
	case <-shutdown:
	}
	return false
}
// filterProposalsWithActiveJobs drops every proposal whose execution key
// (dedupe key, else proposal ID) matches an active tracked job of jobType, and
// returns the surviving proposals together with the number dropped.
//
// The call is stateful: it begins by running expireStaleJobs, which can mutate
// the scheduler's tracked jobs. When no active key exists the input slice is
// returned unchanged; otherwise nil proposals are removed from the result and
// proposals without any key are always kept.
func (r *Plugin) filterProposalsWithActiveJobs(jobType string, proposals []*plugin_pb.JobProposal) ([]*plugin_pb.JobProposal, int) {
	if len(proposals) == 0 {
		return proposals, 0
	}

	r.expireStaleJobs(time.Now().UTC())

	// Gather the keys of active jobs of this type; a job without a dedupe key
	// is identified by its job ID.
	wanted := strings.TrimSpace(jobType)
	busy := make(map[string]struct{})
	r.jobsMu.RLock()
	for _, tracked := range r.jobs {
		if tracked == nil ||
			strings.TrimSpace(tracked.JobType) != wanted ||
			!isActiveTrackedJobState(tracked.State) {
			continue
		}
		key := strings.TrimSpace(tracked.DedupeKey)
		if key == "" {
			key = strings.TrimSpace(tracked.JobID)
		}
		if key != "" {
			busy[key] = struct{}{}
		}
	}
	r.jobsMu.RUnlock()

	if len(busy) == 0 {
		return proposals, 0
	}

	kept := make([]*plugin_pb.JobProposal, 0, len(proposals))
	dropped := 0
	for _, candidate := range proposals {
		if candidate == nil {
			continue
		}
		if key := proposalExecutionKey(candidate); key != "" {
			if _, active := busy[key]; active {
				dropped++
				continue
			}
		}
		kept = append(kept, candidate)
	}
	return kept, dropped
}
// proposalExecutionKey returns the identity used to match a proposal against
// tracked jobs: its whitespace-trimmed DedupeKey, or the trimmed ProposalId
// when the dedupe key is blank. A nil proposal yields "".
func proposalExecutionKey(proposal *plugin_pb.JobProposal) string {
	if proposal == nil {
		return ""
	}
	if dedupe := strings.TrimSpace(proposal.DedupeKey); dedupe != "" {
		return dedupe
	}
	return strings.TrimSpace(proposal.ProposalId)
}
// isActiveTrackedJobState reports whether state names a job that is pending,
// assigned, or running. Matching ignores surrounding whitespace and letter
// case and accepts both the short names and the "job_state_"-prefixed forms
// listed below.
func isActiveTrackedJobState(state string) bool {
	activeStates := [...]string{
		"pending",
		"assigned",
		"running",
		"in_progress",
		"job_state_pending",
		"job_state_assigned",
		"job_state_running",
	}

	candidate := strings.ToLower(strings.TrimSpace(state))
	for _, active := range activeStates {
		if candidate == active {
			return true
		}
	}
	return false
}
// isWaitingTrackedJobState reports whether state names a job that is still
// pending, i.e. "pending" or "job_state_pending" after trimming whitespace and
// lower-casing.
func isWaitingTrackedJobState(state string) bool {
	switch strings.ToLower(strings.TrimSpace(state)) {
	case "pending", "job_state_pending":
		return true
	}
	return false
}
// DispatchProposals dispatches proposals for jobType through
// dispatchScheduledProposals, the same capacity-aware path the scheduler loop
// uses (concurrent execution, executor reservation with backoff, per-job
// retry on transient errors).
//
// The scheduler policy comes from the persisted job type config. If loading it
// fails, or the job type is not enabled, a default policy is substituted so
// that manual runs still work. It returns the number of successful, failed and
// canceled jobs; an empty proposal list returns zeros without loading any
// policy.
func (r *Plugin) DispatchProposals(
	ctx context.Context,
	jobType string,
	proposals []*plugin_pb.JobProposal,
	clusterContext *plugin_pb.ClusterContext,
) (successCount, errorCount, canceledCount int) {
	if len(proposals) == 0 {
		return 0, 0, 0
	}

	policy, enabled, err := r.loadSchedulerPolicy(jobType)
	if usable := err == nil && enabled; !usable {
		// The load error is intentionally not surfaced: fall back to defaults.
		policy = schedulerPolicy{
			ExecutionTimeout:       defaultScheduledExecutionTimeout,
			ExecutionConcurrency:   defaultScheduledExecutionConcurrency,
			PerWorkerConcurrency:   defaultScheduledPerWorkerConcurrency,
			ExecutorReserveBackoff: 200 * time.Millisecond,
			RetryBackoff:           defaultScheduledRetryBackoff,
		}
	}

	return r.dispatchScheduledProposals(ctx, jobType, proposals, clusterContext, policy)
}
// filterScheduledProposals removes nil entries and in-run duplicates from
// proposals, preserving the order of first occurrence.
//
// Two proposals are duplicates when they share the same key: the
// whitespace-trimmed DedupeKey, or the trimmed ProposalId when the dedupe key
// is blank. This is the same normalization proposalExecutionKey applies, so
// in-run deduplication agrees with the active-job filter; in particular a
// whitespace-only dedupe key no longer lumps unrelated proposals together.
// Proposals with no key at all are always kept.
func (r *Plugin) filterScheduledProposals(proposals []*plugin_pb.JobProposal) []*plugin_pb.JobProposal {
	filtered := make([]*plugin_pb.JobProposal, 0, len(proposals))
	seenInRun := make(map[string]struct{}, len(proposals))

	for _, proposal := range proposals {
		if proposal == nil {
			continue
		}

		key := strings.TrimSpace(proposal.DedupeKey)
		if key == "" {
			key = strings.TrimSpace(proposal.ProposalId)
		}
		if key != "" {
			if _, duplicate := seenInRun[key]; duplicate {
				continue
			}
			seenInRun[key] = struct{}{}
		}
		filtered = append(filtered, proposal)
	}
	return filtered
}