Plugin scheduler: sequential iterations with max runtime (#8496)

* pb: add job type max runtime setting

* plugin: default job type max runtime

* plugin: redesign scheduler loop

* admin ui: update scheduler settings

* plugin: fix scheduler loop state name

* plugin scheduler: restore backlog skip

* plugin scheduler: drop legacy detection helper

* admin api: require scheduler config body

* admin ui: preserve detection interval on save

* plugin scheduler: use job context and drain cancels

* plugin scheduler: respect detection intervals

* plugin scheduler: gate runs and drain queue

* ec test: reuse req/resp vars

* ec test: add scheduler debug logs

* Adjust scheduler idle sleep and initial run delay

* Clear pending job queue before scheduler runs

* Log next detection time in EC integration test

* Improve plugin scheduler debug logging in EC test

* Expose scheduler next detection time

* Log scheduler next detection time in EC test

* Wake scheduler on config or worker updates

* Expose scheduler sleep interval in UI

* Fix scheduler sleep save value selection

* Set scheduler idle sleep default to 613s

* Show scheduler next run time in plugin UI

---------

Co-authored-by: Copilot <copilot@github.com>
Chris Lu authored on 2026-03-03 23:09:49 -08:00 · committed by GitHub
parent e1e5b4a8a6 · commit 18ccc9b773
19 changed files with 1241 additions and 191 deletions
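At a glance, the redesign replaces the fixed scheduler tick with sequential iterations and an interruptible idle sleep. The standalone Go sketch below mirrors that shape; miniScheduler and its fields are illustrative names, not code from this commit — the real logic is in schedulerLoop and wakeScheduler in the diff that follows.

package main

import (
	"fmt"
	"time"
)

// miniScheduler mirrors the loop shape introduced here: run iterations
// back-to-back while work exists, otherwise sleep until the idle interval
// elapses, a wake signal arrives, or shutdown is requested.
type miniScheduler struct {
	wakeCh     chan struct{} // buffered(1), so at most one wake is ever pending
	shutdownCh chan struct{}
	idleSleep  time.Duration
}

// wake is a non-blocking send, like Plugin.wakeScheduler: callers such as
// config saves or worker hellos never block, even if a wake is already queued.
func (s *miniScheduler) wake() {
	select {
	case s.wakeCh <- struct{}{}:
	default:
	}
}

func (s *miniScheduler) loop(runIteration func() bool) {
	for {
		if runIteration() {
			continue // had jobs: iterate again immediately
		}
		timer := time.NewTimer(s.idleSleep)
		select {
		case <-s.shutdownCh:
			timer.Stop()
			return
		case <-s.wakeCh: // e.g. a config or worker update cuts the sleep short
			if !timer.Stop() {
				<-timer.C
			}
		case <-timer.C:
		}
	}
}

func main() {
	s := &miniScheduler{
		wakeCh:     make(chan struct{}, 1),
		shutdownCh: make(chan struct{}),
		idleSleep:  200 * time.Millisecond,
	}
	go s.loop(func() bool {
		fmt.Println("iteration at", time.Now().Format("15:04:05.000"))
		return false // pretend nothing was scheduled
	})
	time.Sleep(50 * time.Millisecond)
	s.wake() // wakes the loop well before the 200ms idle sleep expires
	time.Sleep(300 * time.Millisecond)
	close(s.shutdownCh)
	time.Sleep(50 * time.Millisecond)
}

The real loop additionally bounds the sleep by the earliest next detection time (earliestNextDetectionAt), so a due job type never waits out the full idle interval.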

View File

@@ -30,6 +30,7 @@ const (
runsJSONFileName = "runs.json"
trackedJobsJSONFileName = "tracked_jobs.json"
activitiesJSONFileName = "activities.json"
schedulerJSONFileName = "scheduler.json"
defaultDirPerm = 0o755
defaultFilePerm = 0o644
)
@@ -53,6 +54,7 @@ type ConfigStore struct {
memTrackedJobs []TrackedJob
memActivities []JobActivity
memJobDetails map[string]TrackedJob
memScheduler *SchedulerConfig
}
func NewConfigStore(adminDataDir string) (*ConfigStore, error) {
@@ -93,6 +95,60 @@ func (s *ConfigStore) BaseDir() string {
return s.baseDir
}
func (s *ConfigStore) LoadSchedulerConfig() (*SchedulerConfig, error) {
s.mu.RLock()
if !s.configured {
cfg := s.memScheduler
s.mu.RUnlock()
if cfg == nil {
return nil, nil
}
clone := *cfg
return &clone, nil
}
s.mu.RUnlock()
path := filepath.Join(s.baseDir, schedulerJSONFileName)
data, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {
return nil, nil
}
return nil, fmt.Errorf("read scheduler config: %w", err)
}
var cfg SchedulerConfig
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("unmarshal scheduler config: %w", err)
}
return &cfg, nil
}
func (s *ConfigStore) SaveSchedulerConfig(config *SchedulerConfig) error {
if config == nil {
return fmt.Errorf("scheduler config is nil")
}
normalized := normalizeSchedulerConfig(*config)
s.mu.Lock()
if !s.configured {
s.memScheduler = &normalized
s.mu.Unlock()
return nil
}
s.mu.Unlock()
payload, err := json.MarshalIndent(normalized, "", " ")
if err != nil {
return fmt.Errorf("marshal scheduler config: %w", err)
}
path := filepath.Join(s.baseDir, schedulerJSONFileName)
if err := os.WriteFile(path, payload, defaultFilePerm); err != nil {
return fmt.Errorf("save scheduler config: %w", err)
}
return nil
}
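For reference, the file written above is just the normalized struct serialized as JSON. A minimal standalone preview, using a local mirror struct and assuming the idle_sleep_seconds tag defined later in scheduler_config.go (indentation of the real file may differ):

package main

import (
	"encoding/json"
	"fmt"
)

// Local mirror of SchedulerConfig, only to preview the scheduler.json payload.
type schedulerConfig struct {
	IdleSleepSeconds int32 `json:"idle_sleep_seconds"`
}

func main() {
	payload, err := json.MarshalIndent(schedulerConfig{IdleSleepSeconds: 613}, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(payload))
	// {
	//   "idle_sleep_seconds": 613
	// }
}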
func (s *ConfigStore) SaveDescriptor(jobType string, descriptor *plugin_pb.JobTypeDescriptor) error {
if descriptor == nil {
return fmt.Errorf("descriptor is nil")

View File

@@ -68,6 +68,13 @@ type Plugin struct {
adminScriptRunMu sync.RWMutex
schedulerDetectionMu sync.Mutex
schedulerDetection map[string]*schedulerDetectionInfo
schedulerRunMu sync.Mutex
schedulerRun map[string]*schedulerRunInfo
schedulerLoopMu sync.Mutex
schedulerLoopState schedulerLoopState
schedulerConfigMu sync.RWMutex
schedulerConfig SchedulerConfig
schedulerWakeCh chan struct{}
dedupeMu sync.Mutex
recentDedupeByType map[string]map[string]time.Time
@@ -164,14 +171,31 @@ func New(options Options) (*Plugin, error) {
detectorLeases: make(map[string]string),
schedulerExecReservations: make(map[string]int),
schedulerDetection: make(map[string]*schedulerDetectionInfo),
schedulerRun: make(map[string]*schedulerRunInfo),
recentDedupeByType: make(map[string]map[string]time.Time),
jobs: make(map[string]*TrackedJob),
activities: make([]JobActivity, 0, 256),
persistTicker: time.NewTicker(2 * time.Second),
schedulerWakeCh: make(chan struct{}, 1),
shutdownCh: make(chan struct{}),
}
plugin.ctx, plugin.ctxCancel = context.WithCancel(context.Background())
if cfg, err := plugin.store.LoadSchedulerConfig(); err != nil {
glog.Warningf("Plugin failed to load scheduler config: %v", err)
plugin.schedulerConfig = DefaultSchedulerConfig()
} else if cfg == nil {
defaults := DefaultSchedulerConfig()
plugin.schedulerConfig = defaults
if plugin.store.IsConfigured() {
if err := plugin.store.SaveSchedulerConfig(&defaults); err != nil {
glog.Warningf("Plugin failed to persist scheduler defaults: %v", err)
}
}
} else {
plugin.schedulerConfig = normalizeSchedulerConfig(*cfg)
}
if err := plugin.loadPersistedMonitorState(); err != nil {
glog.Warningf("Plugin failed to load persisted monitoring state: %v", err)
}
@@ -371,7 +395,11 @@ func (r *Plugin) LoadJobTypeConfig(jobType string) (*plugin_pb.PersistedJobTypeC
}
func (r *Plugin) SaveJobTypeConfig(config *plugin_pb.PersistedJobTypeConfig) error {
return r.store.SaveJobTypeConfig(config)
if err := r.store.SaveJobTypeConfig(config); err != nil {
return err
}
r.wakeScheduler()
return nil
}
func (r *Plugin) LoadDescriptor(jobType string) (*plugin_pb.JobTypeDescriptor, error) {
@@ -390,6 +418,31 @@ func (r *Plugin) BaseDir() string {
return r.store.BaseDir()
}
func (r *Plugin) GetSchedulerConfig() SchedulerConfig {
if r == nil {
return DefaultSchedulerConfig()
}
r.schedulerConfigMu.RLock()
cfg := r.schedulerConfig
r.schedulerConfigMu.RUnlock()
return normalizeSchedulerConfig(cfg)
}
func (r *Plugin) UpdateSchedulerConfig(cfg SchedulerConfig) (SchedulerConfig, error) {
if r == nil {
return DefaultSchedulerConfig(), fmt.Errorf("plugin is not initialized")
}
normalized := normalizeSchedulerConfig(cfg)
if err := r.store.SaveSchedulerConfig(&normalized); err != nil {
return SchedulerConfig{}, err
}
r.schedulerConfigMu.Lock()
r.schedulerConfig = normalized
r.schedulerConfigMu.Unlock()
r.wakeScheduler()
return normalized, nil
}
func (r *Plugin) acquireAdminLock(reason string) (func(), error) {
if r == nil || r.lockManager == nil {
return func() {}, nil
@@ -912,6 +965,7 @@ func (r *Plugin) handleWorkerMessage(workerID string, message *plugin_pb.WorkerT
switch body := message.Body.(type) {
case *plugin_pb.WorkerToAdminMessage_Hello:
r.registry.UpsertFromHello(body.Hello)
r.wakeScheduler()
case *plugin_pb.WorkerToAdminMessage_Heartbeat:
r.registry.UpdateHeartbeat(workerID, body.Heartbeat)
case *plugin_pb.WorkerToAdminMessage_ConfigSchemaResponse:
@@ -1011,6 +1065,7 @@ func (r *Plugin) ensureJobTypeConfigFromDescriptor(jobType string, descriptor *p
PerWorkerExecutionConcurrency: defaults.PerWorkerExecutionConcurrency,
RetryLimit: defaults.RetryLimit,
RetryBackoffSeconds: defaults.RetryBackoffSeconds,
JobTypeMaxRuntimeSeconds: defaults.JobTypeMaxRuntimeSeconds,
}
}

View File

@@ -861,6 +861,79 @@ func (r *Plugin) trackExecutionQueued(job *plugin_pb.JobSpec) {
})
}
func (r *Plugin) cancelQueuedJob(job *plugin_pb.JobSpec, cause error) {
reason := "job canceled"
if cause != nil {
reason = cause.Error()
}
r.markJobCanceled(job, reason)
}
func (r *Plugin) markJobCanceled(job *plugin_pb.JobSpec, reason string) {
if job == nil || strings.TrimSpace(job.JobId) == "" {
return
}
now := time.Now().UTC()
if strings.TrimSpace(reason) == "" {
reason = "job canceled"
}
r.jobsMu.Lock()
tracked := r.jobs[job.JobId]
if tracked == nil {
tracked = &TrackedJob{
JobID: job.JobId,
CreatedAt: timeToPtr(now),
}
r.jobs[job.JobId] = tracked
}
if job.JobType != "" {
tracked.JobType = job.JobType
}
tracked.State = StateCanceled
tracked.Stage = "canceled"
tracked.Message = reason
tracked.ErrorMessage = reason
tracked.Progress = 0
if tracked.CreatedAt == nil || tracked.CreatedAt.IsZero() {
tracked.CreatedAt = timeToPtr(now)
}
tracked.UpdatedAt = timeToPtr(now)
tracked.CompletedAt = timeToPtr(now)
trackedSnapshot := cloneTrackedJob(*tracked)
r.pruneTrackedJobsLocked()
r.dirtyJobs = true
r.jobsMu.Unlock()
r.persistJobDetailSnapshot(job.JobId, func(detail *TrackedJob) {
detail.JobID = job.JobId
if job.JobType != "" {
detail.JobType = job.JobType
}
detail.State = trackedSnapshot.State
detail.Stage = trackedSnapshot.Stage
detail.Message = trackedSnapshot.Message
detail.ErrorMessage = trackedSnapshot.ErrorMessage
detail.Progress = trackedSnapshot.Progress
if detail.CreatedAt == nil || detail.CreatedAt.IsZero() {
detail.CreatedAt = trackedSnapshot.CreatedAt
}
detail.UpdatedAt = trackedSnapshot.UpdatedAt
detail.CompletedAt = trackedSnapshot.CompletedAt
})
r.appendActivity(JobActivity{
JobID: job.JobId,
JobType: job.JobType,
Source: "admin_scheduler",
Message: reason,
Stage: "canceled",
OccurredAt: timeToPtr(now),
})
}
func (r *Plugin) trackExecutionCompletion(completed *plugin_pb.JobCompleted) *TrackedJob {
if completed == nil || strings.TrimSpace(completed.JobId) == "" {
return nil

View File

@@ -13,13 +13,17 @@ import (
"google.golang.org/protobuf/types/known/timestamppb"
)
var errExecutorAtCapacity = errors.New("executor is at capacity")
var (
errExecutorAtCapacity = errors.New("executor is at capacity")
errSchedulerShutdown = errors.New("scheduler shutdown")
)
const (
defaultSchedulerTick = 5 * time.Second
defaultScheduledDetectionInterval = 300 * time.Second
defaultScheduledDetectionTimeout = 45 * time.Second
defaultScheduledExecutionTimeout = 90 * time.Second
defaultScheduledJobTypeMaxRuntime = 30 * time.Minute
defaultScheduledMaxResults int32 = 1000
defaultScheduledExecutionConcurrency = 1
defaultScheduledPerWorkerConcurrency = 1
@@ -34,6 +38,7 @@ type schedulerPolicy struct {
DetectionInterval time.Duration
DetectionTimeout time.Duration
ExecutionTimeout time.Duration
JobTypeMaxRuntime time.Duration
RetryBackoff time.Duration
MaxResults int32
ExecutionConcurrency int
@@ -44,31 +49,72 @@ type schedulerPolicy struct {
func (r *Plugin) schedulerLoop() {
defer r.wg.Done()
ticker := time.NewTicker(r.schedulerTick)
defer ticker.Stop()
// Try once immediately on startup.
r.runSchedulerTick()
for {
select {
case <-r.shutdownCh:
return
case <-ticker.C:
r.runSchedulerTick()
default:
}
hadJobs := r.runSchedulerIteration()
r.recordSchedulerIterationComplete(hadJobs)
if hadJobs {
continue
}
r.setSchedulerLoopState("", "sleeping")
idleSleep := r.GetSchedulerConfig().IdleSleepDuration()
if nextRun := r.earliestNextDetectionAt(); !nextRun.IsZero() {
if until := time.Until(nextRun); until <= 0 {
idleSleep = 0
} else if until < idleSleep {
idleSleep = until
}
}
if idleSleep <= 0 {
continue
}
timer := time.NewTimer(idleSleep)
select {
case <-r.shutdownCh:
timer.Stop()
return
case <-r.schedulerWakeCh:
if !timer.Stop() {
<-timer.C
}
continue
case <-timer.C:
}
}
}
func (r *Plugin) runSchedulerTick() {
func (r *Plugin) runSchedulerIteration() bool {
r.expireStaleJobs(time.Now().UTC())
jobTypes := r.registry.DetectableJobTypes()
if len(jobTypes) == 0 {
return
r.setSchedulerLoopState("", "idle")
return false
}
r.setSchedulerLoopState("", "waiting_for_lock")
releaseLock, err := r.acquireAdminLock("plugin scheduler iteration")
if err != nil {
glog.Warningf("Plugin scheduler failed to acquire lock: %v", err)
r.setSchedulerLoopState("", "idle")
return false
}
if releaseLock != nil {
defer releaseLock()
}
active := make(map[string]struct{}, len(jobTypes))
schedulerIdleSleep := r.GetSchedulerConfig().IdleSleepDuration()
hadJobs := false
for _, jobType := range jobTypes {
active[jobType] = struct{}{}
@@ -81,20 +127,212 @@ func (r *Plugin) runSchedulerTick() {
r.clearSchedulerJobType(jobType)
continue
}
if !r.markDetectionDue(jobType, policy.DetectionInterval) {
initialDelay := time.Duration(0)
if runInfo := r.snapshotSchedulerRun(jobType); runInfo.lastRunStartedAt.IsZero() {
initialDelay = schedulerIdleSleep / 2
}
if !r.markDetectionDue(jobType, policy.DetectionInterval, initialDelay) {
continue
}
r.wg.Add(1)
go func(jt string, p schedulerPolicy) {
defer r.wg.Done()
r.runScheduledDetection(jt, p)
}(jobType, policy)
detected := r.runJobTypeIteration(jobType, policy)
if detected {
hadJobs = true
}
}
r.pruneSchedulerState(active)
r.pruneDetectorLeases(active)
r.setSchedulerLoopState("", "idle")
return hadJobs
}
func (r *Plugin) wakeScheduler() {
if r == nil {
return
}
select {
case r.schedulerWakeCh <- struct{}{}:
default:
}
}
func (r *Plugin) runJobTypeIteration(jobType string, policy schedulerPolicy) bool {
r.recordSchedulerRunStart(jobType)
r.clearWaitingJobQueue(jobType)
r.setSchedulerLoopState(jobType, "detecting")
r.markJobTypeInFlight(jobType)
defer r.finishDetection(jobType)
start := time.Now().UTC()
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: "scheduled detection started",
Stage: "detecting",
OccurredAt: timeToPtr(start),
})
if skip, waitingCount, waitingThreshold := r.shouldSkipDetectionForWaitingJobs(jobType, policy); skip {
r.recordSchedulerDetectionSkip(jobType, fmt.Sprintf("waiting backlog %d reached threshold %d", waitingCount, waitingThreshold))
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection skipped: waiting backlog %d reached threshold %d", waitingCount, waitingThreshold),
Stage: "skipped_waiting_backlog",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerRunComplete(jobType, "skipped")
return false
}
maxRuntime := policy.JobTypeMaxRuntime
if maxRuntime <= 0 {
maxRuntime = defaultScheduledJobTypeMaxRuntime
}
jobCtx, cancel := context.WithTimeout(context.Background(), maxRuntime)
defer cancel()
clusterContext, err := r.loadSchedulerClusterContext(jobCtx)
if err != nil {
r.recordSchedulerDetectionError(jobType, err)
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection aborted: %v", err),
Stage: "failed",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerRunComplete(jobType, "error")
return false
}
detectionTimeout := policy.DetectionTimeout
remaining := time.Until(start.Add(maxRuntime))
if remaining <= 0 {
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: "scheduled run timed out before detection",
Stage: "timeout",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerRunComplete(jobType, "timeout")
return false
}
if detectionTimeout <= 0 {
detectionTimeout = defaultScheduledDetectionTimeout
}
if detectionTimeout > remaining {
detectionTimeout = remaining
}
detectCtx, cancelDetect := context.WithTimeout(jobCtx, detectionTimeout)
proposals, err := r.RunDetection(detectCtx, jobType, clusterContext, policy.MaxResults)
cancelDetect()
if err != nil {
r.recordSchedulerDetectionError(jobType, err)
stage := "failed"
status := "error"
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
stage = "timeout"
status = "timeout"
}
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection failed: %v", err),
Stage: stage,
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerRunComplete(jobType, status)
return false
}
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection completed: %d proposal(s)", len(proposals)),
Stage: "detected",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerDetectionSuccess(jobType, len(proposals))
detected := len(proposals) > 0
filteredByActive, skippedActive := r.filterProposalsWithActiveJobs(jobType, proposals)
if skippedActive > 0 {
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection skipped %d proposal(s) due to active assigned/running jobs", skippedActive),
Stage: "deduped_active_jobs",
OccurredAt: timeToPtr(time.Now().UTC()),
})
}
if len(filteredByActive) == 0 {
r.recordSchedulerRunComplete(jobType, "success")
return detected
}
filtered := r.filterScheduledProposals(filteredByActive)
if len(filtered) != len(filteredByActive) {
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection deduped %d proposal(s) within this run", len(filteredByActive)-len(filtered)),
Stage: "deduped",
OccurredAt: timeToPtr(time.Now().UTC()),
})
}
if len(filtered) == 0 {
r.recordSchedulerRunComplete(jobType, "success")
return detected
}
r.setSchedulerLoopState(jobType, "executing")
remaining = time.Until(start.Add(maxRuntime))
if remaining <= 0 {
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: "scheduled execution skipped: job type max runtime reached",
Stage: "timeout",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerRunComplete(jobType, "timeout")
return detected
}
execPolicy := policy
if execPolicy.ExecutionTimeout <= 0 {
execPolicy.ExecutionTimeout = defaultScheduledExecutionTimeout
}
if execPolicy.ExecutionTimeout > remaining {
execPolicy.ExecutionTimeout = remaining
}
successCount, errorCount, canceledCount := r.dispatchScheduledProposals(jobCtx, jobType, filtered, clusterContext, execPolicy)
status := "success"
if jobCtx.Err() != nil {
status = "timeout"
} else if errorCount > 0 || canceledCount > 0 {
status = "error"
}
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled execution finished: success=%d error=%d canceled=%d", successCount, errorCount, canceledCount),
Stage: "executed",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerRunComplete(jobType, status)
return detected
}
func (r *Plugin) loadSchedulerPolicy(jobType string) (schedulerPolicy, bool, error) {
@@ -119,6 +357,7 @@ func (r *Plugin) loadSchedulerPolicy(jobType string) (schedulerPolicy, bool, err
DetectionInterval: durationFromSeconds(adminRuntime.DetectionIntervalSeconds, defaultScheduledDetectionInterval),
DetectionTimeout: durationFromSeconds(adminRuntime.DetectionTimeoutSeconds, defaultScheduledDetectionTimeout),
ExecutionTimeout: defaultScheduledExecutionTimeout,
JobTypeMaxRuntime: durationFromSeconds(adminRuntime.JobTypeMaxRuntimeSeconds, defaultScheduledJobTypeMaxRuntime),
RetryBackoff: durationFromSeconds(adminRuntime.RetryBackoffSeconds, defaultScheduledRetryBackoff),
MaxResults: adminRuntime.MaxJobsPerDetection,
ExecutionConcurrency: int(adminRuntime.GlobalExecutionConcurrency),
@@ -148,6 +387,9 @@ func (r *Plugin) loadSchedulerPolicy(jobType string) (schedulerPolicy, bool, err
if policy.RetryLimit < 0 {
policy.RetryLimit = 0
}
if policy.JobTypeMaxRuntime <= 0 {
policy.JobTypeMaxRuntime = defaultScheduledJobTypeMaxRuntime
}
// Plugin protocol currently has only detection timeout in admin settings.
execTimeout := time.Duration(adminRuntime.DetectionTimeoutSeconds*2) * time.Second
@@ -199,6 +441,7 @@ func (r *Plugin) ListSchedulerStates() ([]SchedulerJobTypeState, error) {
state.DetectionIntervalSeconds = secondsFromDuration(policy.DetectionInterval)
state.DetectionTimeoutSeconds = secondsFromDuration(policy.DetectionTimeout)
state.ExecutionTimeoutSeconds = secondsFromDuration(policy.ExecutionTimeout)
state.JobTypeMaxRuntimeSeconds = secondsFromDuration(policy.JobTypeMaxRuntime)
state.MaxJobsPerDetection = policy.MaxResults
state.GlobalExecutionConcurrency = policy.ExecutionConcurrency
state.PerWorkerExecutionConcurrency = policy.PerWorkerConcurrency
@@ -207,6 +450,19 @@ func (r *Plugin) ListSchedulerStates() ([]SchedulerJobTypeState, error) {
}
}
runInfo := r.snapshotSchedulerRun(jobType)
if !runInfo.lastRunStartedAt.IsZero() {
at := runInfo.lastRunStartedAt
state.LastRunStartedAt = &at
}
if !runInfo.lastRunCompletedAt.IsZero() {
at := runInfo.lastRunCompletedAt
state.LastRunCompletedAt = &at
}
if runInfo.lastRunStatus != "" {
state.LastRunStatus = runInfo.lastRunStatus
}
leasedWorkerID := r.getDetectorLease(jobType)
if leasedWorkerID != "" {
state.DetectorWorkerID = leasedWorkerID
@@ -258,10 +514,11 @@ func deriveSchedulerAdminRuntime(
PerWorkerExecutionConcurrency: defaults.PerWorkerExecutionConcurrency,
RetryLimit: defaults.RetryLimit,
RetryBackoffSeconds: defaults.RetryBackoffSeconds,
JobTypeMaxRuntimeSeconds: defaults.JobTypeMaxRuntimeSeconds,
}
}
func (r *Plugin) markDetectionDue(jobType string, interval time.Duration) bool {
func (r *Plugin) markDetectionDue(jobType string, interval, initialDelay time.Duration) bool {
now := time.Now().UTC()
r.schedulerMu.Lock()
@@ -275,12 +532,43 @@ func (r *Plugin) markDetectionDue(jobType string, interval time.Duration) bool {
if exists && now.Before(nextRun) {
return false
}
if !exists && initialDelay > 0 {
r.nextDetectionAt[jobType] = now.Add(initialDelay)
return false
}
r.nextDetectionAt[jobType] = now.Add(interval)
r.detectionInFlight[jobType] = true
return true
}
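As a rough worked example of the new initialDelay argument: runSchedulerIteration passes half the idle sleep for a job type that has never run, so with the default 613s idle sleep the first detection is deferred rather than firing immediately (a sketch using the default value, not measured behavior):

package main

import (
	"fmt"
	"time"
)

func main() {
	idleSleep := 613 * time.Second
	initialDelay := idleSleep / 2
	fmt.Println(initialDelay) // 5m6.5s before the first scheduled detection of a new job type
}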
func (r *Plugin) earliestNextDetectionAt() time.Time {
if r == nil {
return time.Time{}
}
r.schedulerMu.Lock()
defer r.schedulerMu.Unlock()
var earliest time.Time
for _, nextRun := range r.nextDetectionAt {
if nextRun.IsZero() {
continue
}
if earliest.IsZero() || nextRun.Before(earliest) {
earliest = nextRun
}
}
return earliest
}
func (r *Plugin) markJobTypeInFlight(jobType string) {
r.schedulerMu.Lock()
r.detectionInFlight[jobType] = true
r.schedulerMu.Unlock()
}
func (r *Plugin) finishDetection(jobType string) {
r.schedulerMu.Lock()
delete(r.detectionInFlight, jobType)
@@ -318,125 +606,18 @@ func (r *Plugin) pruneDetectorLeases(activeJobTypes map[string]struct{}) {
}
}
func (r *Plugin) runScheduledDetection(jobType string, policy schedulerPolicy) {
defer r.finishDetection(jobType)
releaseLock, lockErr := r.acquireAdminLock(fmt.Sprintf("plugin scheduled detection %s", jobType))
if lockErr != nil {
r.recordSchedulerDetectionError(jobType, lockErr)
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection aborted: failed to acquire lock: %v", lockErr),
Stage: "failed",
OccurredAt: timeToPtr(time.Now().UTC()),
})
return
}
if releaseLock != nil {
defer releaseLock()
}
start := time.Now().UTC()
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: "scheduled detection started",
Stage: "detecting",
OccurredAt: timeToPtr(start),
})
if skip, waitingCount, waitingThreshold := r.shouldSkipDetectionForWaitingJobs(jobType, policy); skip {
r.recordSchedulerDetectionSkip(jobType, fmt.Sprintf("waiting backlog %d reached threshold %d", waitingCount, waitingThreshold))
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection skipped: waiting backlog %d reached threshold %d", waitingCount, waitingThreshold),
Stage: "skipped_waiting_backlog",
OccurredAt: timeToPtr(time.Now().UTC()),
})
return
}
clusterContext, err := r.loadSchedulerClusterContext()
if err != nil {
r.recordSchedulerDetectionError(jobType, err)
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection aborted: %v", err),
Stage: "failed",
OccurredAt: timeToPtr(time.Now().UTC()),
})
return
}
ctx, cancel := context.WithTimeout(context.Background(), policy.DetectionTimeout)
proposals, err := r.RunDetection(ctx, jobType, clusterContext, policy.MaxResults)
cancel()
if err != nil {
r.recordSchedulerDetectionError(jobType, err)
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection failed: %v", err),
Stage: "failed",
OccurredAt: timeToPtr(time.Now().UTC()),
})
return
}
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection completed: %d proposal(s)", len(proposals)),
Stage: "detected",
OccurredAt: timeToPtr(time.Now().UTC()),
})
r.recordSchedulerDetectionSuccess(jobType, len(proposals))
filteredByActive, skippedActive := r.filterProposalsWithActiveJobs(jobType, proposals)
if skippedActive > 0 {
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection skipped %d proposal(s) due to active assigned/running jobs", skippedActive),
Stage: "deduped_active_jobs",
OccurredAt: timeToPtr(time.Now().UTC()),
})
}
if len(filteredByActive) == 0 {
return
}
filtered := r.filterScheduledProposals(filteredByActive)
if len(filtered) != len(filteredByActive) {
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled detection deduped %d proposal(s) within this run", len(filteredByActive)-len(filtered)),
Stage: "deduped",
OccurredAt: timeToPtr(time.Now().UTC()),
})
}
if len(filtered) == 0 {
return
}
r.dispatchScheduledProposals(jobType, filtered, clusterContext, policy)
}
func (r *Plugin) loadSchedulerClusterContext() (*plugin_pb.ClusterContext, error) {
func (r *Plugin) loadSchedulerClusterContext(ctx context.Context) (*plugin_pb.ClusterContext, error) {
if r.clusterContextProvider == nil {
return nil, fmt.Errorf("cluster context provider is not configured")
}
ctx, cancel := context.WithTimeout(context.Background(), defaultClusterContextTimeout)
if ctx == nil {
ctx = context.Background()
}
clusterCtx, cancel := context.WithTimeout(ctx, defaultClusterContextTimeout)
defer cancel()
clusterContext, err := r.clusterContextProvider(ctx)
clusterContext, err := r.clusterContextProvider(clusterCtx)
if err != nil {
return nil, err
}
@@ -447,11 +628,16 @@ func (r *Plugin) loadSchedulerClusterContext() (*plugin_pb.ClusterContext, error
}
func (r *Plugin) dispatchScheduledProposals(
ctx context.Context,
jobType string,
proposals []*plugin_pb.JobProposal,
clusterContext *plugin_pb.ClusterContext,
policy schedulerPolicy,
) {
) (int, int, int) {
if ctx == nil {
ctx = context.Background()
}
jobQueue := make(chan *plugin_pb.JobSpec, len(proposals))
for index, proposal := range proposals {
job := buildScheduledJobSpec(jobType, proposal, index)
@@ -459,7 +645,7 @@ func (r *Plugin) dispatchScheduledProposals(
select {
case <-r.shutdownCh:
close(jobQueue)
return
return 0, 0, 0
default:
jobQueue <- job
}
@@ -470,6 +656,7 @@ func (r *Plugin) dispatchScheduledProposals(
var statsMu sync.Mutex
successCount := 0
errorCount := 0
canceledCount := 0
workerCount := policy.ExecutionConcurrency
if workerCount < 1 {
@@ -481,6 +668,7 @@ func (r *Plugin) dispatchScheduledProposals(
go func() {
defer wg.Done()
jobLoop:
for job := range jobQueue {
select {
case <-r.shutdownCh:
@@ -488,19 +676,36 @@ func (r *Plugin) dispatchScheduledProposals(
default:
}
if ctx.Err() != nil {
r.cancelQueuedJob(job, ctx.Err())
statsMu.Lock()
canceledCount++
statsMu.Unlock()
continue
}
for {
select {
case <-r.shutdownCh:
return
default:
}
if ctx.Err() != nil {
r.cancelQueuedJob(job, ctx.Err())
statsMu.Lock()
canceledCount++
statsMu.Unlock()
continue jobLoop
}
executor, release, reserveErr := r.reserveScheduledExecutor(jobType, policy)
executor, release, reserveErr := r.reserveScheduledExecutor(ctx, jobType, policy)
if reserveErr != nil {
select {
case <-r.shutdownCh:
return
default:
if ctx.Err() != nil {
r.cancelQueuedJob(job, ctx.Err())
statsMu.Lock()
canceledCount++
statsMu.Unlock()
continue jobLoop
}
statsMu.Lock()
errorCount++
@@ -515,16 +720,23 @@ func (r *Plugin) dispatchScheduledProposals(
break
}
err := r.executeScheduledJobWithExecutor(executor, job, clusterContext, policy)
err := r.executeScheduledJobWithExecutor(ctx, executor, job, clusterContext, policy)
release()
if errors.Is(err, errExecutorAtCapacity) {
r.trackExecutionQueued(job)
if !waitForShutdownOrTimer(r.shutdownCh, policy.ExecutorReserveBackoff) {
if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.ExecutorReserveBackoff) {
return
}
continue
}
if err != nil {
if ctx.Err() != nil || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
r.cancelQueuedJob(job, err)
statsMu.Lock()
canceledCount++
statsMu.Unlock()
continue jobLoop
}
statsMu.Lock()
errorCount++
statsMu.Unlock()
@@ -550,23 +762,34 @@ func (r *Plugin) dispatchScheduledProposals(
wg.Wait()
r.appendActivity(JobActivity{
JobType: jobType,
Source: "admin_scheduler",
Message: fmt.Sprintf("scheduled execution finished: success=%d error=%d", successCount, errorCount),
Stage: "executed",
OccurredAt: timeToPtr(time.Now().UTC()),
})
drainErr := ctx.Err()
if drainErr == nil {
drainErr = errSchedulerShutdown
}
for job := range jobQueue {
r.cancelQueuedJob(job, drainErr)
canceledCount++
}
return successCount, errorCount, canceledCount
}
func (r *Plugin) reserveScheduledExecutor(
ctx context.Context,
jobType string,
policy schedulerPolicy,
) (*WorkerSession, func(), error) {
if ctx == nil {
ctx = context.Background()
}
deadline := time.Now().Add(policy.ExecutionTimeout)
if policy.ExecutionTimeout <= 0 {
deadline = time.Now().Add(10 * time.Minute) // Default cap
}
if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) {
deadline = ctxDeadline
}
for {
select {
@@ -574,6 +797,9 @@ func (r *Plugin) reserveScheduledExecutor(
return nil, nil, fmt.Errorf("plugin is shutting down")
default:
}
if ctx.Err() != nil {
return nil, nil, ctx.Err()
}
if time.Now().After(deadline) {
return nil, nil, fmt.Errorf("timed out waiting for executor capacity for %s", jobType)
@@ -581,7 +807,10 @@ func (r *Plugin) reserveScheduledExecutor(
executors, err := r.registry.ListExecutors(jobType)
if err != nil {
if !waitForShutdownOrTimer(r.shutdownCh, policy.ExecutorReserveBackoff) {
if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.ExecutorReserveBackoff) {
if ctx.Err() != nil {
return nil, nil, ctx.Err()
}
return nil, nil, fmt.Errorf("plugin is shutting down")
}
continue
@@ -595,7 +824,10 @@ func (r *Plugin) reserveScheduledExecutor(
return executor, release, nil
}
if !waitForShutdownOrTimer(r.shutdownCh, policy.ExecutorReserveBackoff) {
if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.ExecutorReserveBackoff) {
if ctx.Err() != nil {
return nil, nil, ctx.Err()
}
return nil, nil, fmt.Errorf("plugin is shutting down")
}
}
@@ -680,6 +912,7 @@ func schedulerWorkerExecutionLimit(executor *WorkerSession, jobType string, poli
}
func (r *Plugin) executeScheduledJobWithExecutor(
ctx context.Context,
executor *WorkerSession,
job *plugin_pb.JobSpec,
clusterContext *plugin_pb.ClusterContext,
@@ -697,8 +930,15 @@ func (r *Plugin) executeScheduledJobWithExecutor(
return fmt.Errorf("plugin is shutting down")
default:
}
if ctx != nil && ctx.Err() != nil {
return ctx.Err()
}
execCtx, cancel := context.WithTimeout(context.Background(), policy.ExecutionTimeout)
parent := ctx
if parent == nil {
parent = context.Background()
}
execCtx, cancel := context.WithTimeout(parent, policy.ExecutionTimeout)
_, err := r.executeJobWithExecutor(execCtx, executor, job, clusterContext, int32(attempt))
cancel()
if err == nil {
@@ -718,7 +958,10 @@ func (r *Plugin) executeScheduledJobWithExecutor(
Stage: "retry",
OccurredAt: timeToPtr(time.Now().UTC()),
})
if !waitForShutdownOrTimer(r.shutdownCh, policy.RetryBackoff) {
if !waitForShutdownOrTimerWithContext(r.shutdownCh, ctx, policy.RetryBackoff) {
if ctx != nil && ctx.Err() != nil {
return ctx.Err()
}
return fmt.Errorf("plugin is shutting down")
}
}
@@ -764,6 +1007,53 @@ func (r *Plugin) countWaitingTrackedJobs(jobType string) int {
return waiting
}
func (r *Plugin) clearWaitingJobQueue(jobType string) int {
normalizedJobType := strings.TrimSpace(jobType)
if normalizedJobType == "" {
return 0
}
jobIDs := make([]string, 0)
seen := make(map[string]struct{})
r.jobsMu.RLock()
for _, job := range r.jobs {
if job == nil {
continue
}
if strings.TrimSpace(job.JobType) != normalizedJobType {
continue
}
if !isWaitingTrackedJobState(job.State) {
continue
}
jobID := strings.TrimSpace(job.JobID)
if jobID == "" {
continue
}
if _, ok := seen[jobID]; ok {
continue
}
seen[jobID] = struct{}{}
jobIDs = append(jobIDs, jobID)
}
r.jobsMu.RUnlock()
if len(jobIDs) == 0 {
return 0
}
reason := fmt.Sprintf("cleared queued job before %s run", normalizedJobType)
for _, jobID := range jobIDs {
r.markJobCanceled(&plugin_pb.JobSpec{
JobId: jobID,
JobType: normalizedJobType,
}, reason)
}
return len(jobIDs)
}
func waitingBacklogThreshold(policy schedulerPolicy) int {
concurrency := policy.ExecutionConcurrency
if concurrency <= 0 {
@@ -861,6 +1151,27 @@ func waitForShutdownOrTimer(shutdown <-chan struct{}, duration time.Duration) bo
}
}
func waitForShutdownOrTimerWithContext(shutdown <-chan struct{}, ctx context.Context, duration time.Duration) bool {
if duration <= 0 {
return true
}
if ctx == nil {
ctx = context.Background()
}
timer := time.NewTimer(duration)
defer timer.Stop()
select {
case <-shutdown:
return false
case <-ctx.Done():
return false
case <-timer.C:
return true
}
}
// filterProposalsWithActiveJobs removes proposals whose dedupe keys already have active jobs.
// It first expires stale tracked jobs via expireStaleJobs, which can mutate scheduler state,
// so callers should treat this method as a stateful operation.
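The per-job-type max runtime added in runJobTypeIteration acts as a budget that clips each phase's timeout. The helper below is an illustrative standalone sketch of that arithmetic (clampToRemaining is not a function in this commit, and the real code applies the phase defaults before clamping):

package main

import (
	"fmt"
	"time"
)

// clampToRemaining caps a phase timeout by what is left of the job type's
// max runtime; a non-positive remainder means the run is recorded as "timeout".
func clampToRemaining(start time.Time, maxRuntime, phaseTimeout time.Duration) (time.Duration, bool) {
	remaining := time.Until(start.Add(maxRuntime))
	if remaining <= 0 {
		return 0, false
	}
	if phaseTimeout > remaining {
		phaseTimeout = remaining
	}
	return phaseTimeout, true
}

func main() {
	start := time.Now().Add(-25 * time.Minute) // pretend the run started 25 minutes ago
	if t, ok := clampToRemaining(start, 30*time.Minute, 45*time.Second); ok {
		fmt.Println("detection timeout:", t) // 45s fits inside the ~5m that remains
	}
	if t, ok := clampToRemaining(start, 30*time.Minute, 10*time.Minute); ok {
		fmt.Println("execution timeout:", t) // clipped to the ~5m that remains
	}
}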

View File

@@ -1,6 +1,7 @@
package plugin
import (
"context"
"fmt"
"testing"
"time"
@@ -28,6 +29,7 @@ func TestLoadSchedulerPolicyUsesAdminConfig(t *testing.T) {
PerWorkerExecutionConcurrency: 2,
RetryLimit: 4,
RetryBackoffSeconds: 7,
JobTypeMaxRuntimeSeconds: 1800,
},
})
if err != nil {
@@ -53,6 +55,9 @@ func TestLoadSchedulerPolicyUsesAdminConfig(t *testing.T) {
if policy.RetryLimit != 4 {
t.Fatalf("unexpected retry limit: got=%d", policy.RetryLimit)
}
if policy.JobTypeMaxRuntime != 30*time.Minute {
t.Fatalf("unexpected max runtime: got=%v", policy.JobTypeMaxRuntime)
}
}
func TestLoadSchedulerPolicyUsesDescriptorDefaultsWhenConfigMissing(t *testing.T) {
@@ -75,6 +80,7 @@ func TestLoadSchedulerPolicyUsesDescriptorDefaultsWhenConfigMissing(t *testing.T
PerWorkerExecutionConcurrency: 2,
RetryLimit: 3,
RetryBackoffSeconds: 6,
JobTypeMaxRuntimeSeconds: 1200,
},
})
if err != nil {
@@ -97,6 +103,9 @@ func TestLoadSchedulerPolicyUsesDescriptorDefaultsWhenConfigMissing(t *testing.T
if policy.PerWorkerConcurrency != 2 {
t.Fatalf("unexpected per-worker concurrency: got=%d", policy.PerWorkerConcurrency)
}
if policy.JobTypeMaxRuntime != 20*time.Minute {
t.Fatalf("unexpected max runtime: got=%v", policy.JobTypeMaxRuntime)
}
}
func TestReserveScheduledExecutorRespectsPerWorkerLimit(t *testing.T) {
@@ -126,13 +135,13 @@ func TestReserveScheduledExecutorRespectsPerWorkerLimit(t *testing.T) {
ExecutorReserveBackoff: time.Millisecond,
}
executor1, release1, err := pluginSvc.reserveScheduledExecutor("balance", policy)
executor1, release1, err := pluginSvc.reserveScheduledExecutor(context.Background(), "balance", policy)
if err != nil {
t.Fatalf("reserve executor 1: %v", err)
}
defer release1()
executor2, release2, err := pluginSvc.reserveScheduledExecutor("balance", policy)
executor2, release2, err := pluginSvc.reserveScheduledExecutor(context.Background(), "balance", policy)
if err != nil {
t.Fatalf("reserve executor 2: %v", err)
}
@@ -254,7 +263,7 @@ func TestReserveScheduledExecutorTimesOutWhenNoExecutor(t *testing.T) {
start := time.Now()
pluginSvc.Shutdown()
_, _, err = pluginSvc.reserveScheduledExecutor("missing-job-type", policy)
_, _, err = pluginSvc.reserveScheduledExecutor(context.Background(), "missing-job-type", policy)
if err == nil {
t.Fatalf("expected reservation shutdown error")
}
@@ -285,7 +294,7 @@ func TestReserveScheduledExecutorWaitsForWorkerCapacity(t *testing.T) {
ExecutorReserveBackoff: 5 * time.Millisecond,
}
_, release1, err := pluginSvc.reserveScheduledExecutor("balance", policy)
_, release1, err := pluginSvc.reserveScheduledExecutor(context.Background(), "balance", policy)
if err != nil {
t.Fatalf("reserve executor 1: %v", err)
}
@@ -296,7 +305,7 @@ func TestReserveScheduledExecutorWaitsForWorkerCapacity(t *testing.T) {
}
secondReserveCh := make(chan reserveResult, 1)
go func() {
_, release2, reserveErr := pluginSvc.reserveScheduledExecutor("balance", policy)
_, release2, reserveErr := pluginSvc.reserveScheduledExecutor(context.Background(), "balance", policy)
if release2 != nil {
release2()
}
@@ -394,6 +403,7 @@ func TestListSchedulerStatesIncludesPolicyAndState(t *testing.T) {
PerWorkerExecutionConcurrency: 2,
RetryLimit: 1,
RetryBackoffSeconds: 9,
JobTypeMaxRuntimeSeconds: 900,
},
})
if err != nil {
@@ -446,6 +456,9 @@ func TestListSchedulerStatesIncludesPolicyAndState(t *testing.T) {
if state.ExecutionTimeoutSeconds != 90 {
t.Fatalf("unexpected execution timeout: got=%d", state.ExecutionTimeoutSeconds)
}
if state.JobTypeMaxRuntimeSeconds != 900 {
t.Fatalf("unexpected job type max runtime: got=%d", state.JobTypeMaxRuntimeSeconds)
}
if state.MaxJobsPerDetection != 80 {
t.Fatalf("unexpected max jobs per detection: got=%d", state.MaxJobsPerDetection)
}

View File

@@ -0,0 +1,31 @@
package plugin
import "time"
const (
defaultSchedulerIdleSleep = 613 * time.Second
)
type SchedulerConfig struct {
IdleSleepSeconds int32 `json:"idle_sleep_seconds"`
}
func DefaultSchedulerConfig() SchedulerConfig {
return SchedulerConfig{
IdleSleepSeconds: int32(defaultSchedulerIdleSleep / time.Second),
}
}
func normalizeSchedulerConfig(cfg SchedulerConfig) SchedulerConfig {
if cfg.IdleSleepSeconds <= 0 {
return DefaultSchedulerConfig()
}
return cfg
}
func (c SchedulerConfig) IdleSleepDuration() time.Duration {
if c.IdleSleepSeconds <= 0 {
return defaultSchedulerIdleSleep
}
return time.Duration(c.IdleSleepSeconds) * time.Second
}
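A sketch of a unit test one could add alongside this file to pin the normalization rules shown above (not part of this commit):

package plugin

import (
	"testing"
	"time"
)

func TestSchedulerConfigNormalizationSketch(t *testing.T) {
	// Non-positive values fall back to the 613s default.
	if got := normalizeSchedulerConfig(SchedulerConfig{}).IdleSleepSeconds; got != 613 {
		t.Fatalf("expected default 613, got %d", got)
	}
	// Positive values pass through unchanged and convert to a duration.
	cfg := SchedulerConfig{IdleSleepSeconds: 45}
	if got := normalizeSchedulerConfig(cfg); got != cfg {
		t.Fatalf("expected %+v, got %+v", cfg, got)
	}
	if d := cfg.IdleSleepDuration(); d != 45*time.Second {
		t.Fatalf("expected 45s, got %v", d)
	}
}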

View File

@@ -9,6 +9,12 @@ import (
type SchedulerStatus struct {
Now time.Time `json:"now"`
SchedulerTickSeconds int `json:"scheduler_tick_seconds"`
IdleSleepSeconds int `json:"idle_sleep_seconds,omitempty"`
NextDetectionAt *time.Time `json:"next_detection_at,omitempty"`
CurrentJobType string `json:"current_job_type,omitempty"`
CurrentPhase string `json:"current_phase,omitempty"`
LastIterationHadJobs bool `json:"last_iteration_had_jobs,omitempty"`
LastIterationDoneAt *time.Time `json:"last_iteration_done_at,omitempty"`
Waiting []SchedulerWaitingStatus `json:"waiting,omitempty"`
InProcessJobs []SchedulerJobStatus `json:"in_process_jobs,omitempty"`
JobTypes []SchedulerJobTypeStatus `json:"job_types,omitempty"`
@@ -56,6 +62,19 @@ type schedulerDetectionInfo struct {
lastSkippedReason string
}
type schedulerRunInfo struct {
lastRunStartedAt time.Time
lastRunCompletedAt time.Time
lastRunStatus string
}
type schedulerLoopState struct {
currentJobType string
currentPhase string
lastIterationHadJobs bool
lastIterationCompleted time.Time
}
func (r *Plugin) recordSchedulerDetectionSuccess(jobType string, count int) {
if r == nil {
return
@@ -122,12 +141,105 @@ func (r *Plugin) snapshotSchedulerDetection(jobType string) schedulerDetectionIn
return *info
}
func (r *Plugin) recordSchedulerRunStart(jobType string) {
if r == nil {
return
}
r.schedulerRunMu.Lock()
defer r.schedulerRunMu.Unlock()
info := r.schedulerRun[jobType]
if info == nil {
info = &schedulerRunInfo{}
r.schedulerRun[jobType] = info
}
info.lastRunStartedAt = time.Now().UTC()
info.lastRunStatus = ""
}
func (r *Plugin) recordSchedulerRunComplete(jobType, status string) {
if r == nil {
return
}
r.schedulerRunMu.Lock()
defer r.schedulerRunMu.Unlock()
info := r.schedulerRun[jobType]
if info == nil {
info = &schedulerRunInfo{}
r.schedulerRun[jobType] = info
}
info.lastRunCompletedAt = time.Now().UTC()
info.lastRunStatus = status
}
func (r *Plugin) snapshotSchedulerRun(jobType string) schedulerRunInfo {
if r == nil {
return schedulerRunInfo{}
}
r.schedulerRunMu.Lock()
defer r.schedulerRunMu.Unlock()
info := r.schedulerRun[jobType]
if info == nil {
return schedulerRunInfo{}
}
return *info
}
func (r *Plugin) setSchedulerLoopState(jobType, phase string) {
if r == nil {
return
}
r.schedulerLoopMu.Lock()
r.schedulerLoopState.currentJobType = jobType
r.schedulerLoopState.currentPhase = phase
r.schedulerLoopMu.Unlock()
}
func (r *Plugin) recordSchedulerIterationComplete(hadJobs bool) {
if r == nil {
return
}
r.schedulerLoopMu.Lock()
r.schedulerLoopState.lastIterationHadJobs = hadJobs
r.schedulerLoopState.lastIterationCompleted = time.Now().UTC()
r.schedulerLoopMu.Unlock()
}
func (r *Plugin) snapshotSchedulerLoopState() schedulerLoopState {
if r == nil {
return schedulerLoopState{}
}
r.schedulerLoopMu.Lock()
defer r.schedulerLoopMu.Unlock()
return r.schedulerLoopState
}
func (r *Plugin) GetSchedulerStatus() SchedulerStatus {
now := time.Now().UTC()
loopState := r.snapshotSchedulerLoopState()
schedulerConfig := r.GetSchedulerConfig()
status := SchedulerStatus{
Now: now,
SchedulerTickSeconds: int(secondsFromDuration(r.schedulerTick)),
InProcessJobs: r.listInProcessJobs(now),
IdleSleepSeconds: int(schedulerConfig.IdleSleepSeconds),
CurrentJobType: loopState.currentJobType,
CurrentPhase: loopState.currentPhase,
LastIterationHadJobs: loopState.lastIterationHadJobs,
}
nextDetectionAt := r.earliestNextDetectionAt()
if nextDetectionAt.IsZero() && loopState.currentPhase == "sleeping" && !loopState.lastIterationCompleted.IsZero() {
idleSleep := schedulerConfig.IdleSleepDuration()
if idleSleep > 0 {
nextDetectionAt = loopState.lastIterationCompleted.Add(idleSleep)
}
}
if !nextDetectionAt.IsZero() {
at := nextDetectionAt
status.NextDetectionAt = &at
}
if !loopState.lastIterationCompleted.IsZero() {
at := loopState.lastIterationCompleted
status.LastIterationDoneAt = &at
}
states, err := r.ListSchedulerStates()

View File

@@ -90,6 +90,7 @@ type SchedulerJobTypeState struct {
DetectionIntervalSeconds int32 `json:"detection_interval_seconds,omitempty"`
DetectionTimeoutSeconds int32 `json:"detection_timeout_seconds,omitempty"`
ExecutionTimeoutSeconds int32 `json:"execution_timeout_seconds,omitempty"`
JobTypeMaxRuntimeSeconds int32 `json:"job_type_max_runtime_seconds,omitempty"`
MaxJobsPerDetection int32 `json:"max_jobs_per_detection,omitempty"`
GlobalExecutionConcurrency int `json:"global_execution_concurrency,omitempty"`
PerWorkerExecutionConcurrency int `json:"per_worker_execution_concurrency,omitempty"`
@@ -98,6 +99,9 @@ type SchedulerJobTypeState struct {
DetectorAvailable bool `json:"detector_available"`
DetectorWorkerID string `json:"detector_worker_id,omitempty"`
ExecutorWorkerCount int `json:"executor_worker_count"`
LastRunStartedAt *time.Time `json:"last_run_started_at,omitempty"`
LastRunCompletedAt *time.Time `json:"last_run_completed_at,omitempty"`
LastRunStatus string `json:"last_run_status,omitempty"`
}
func timeToPtr(t time.Time) *time.Time {