admin: fix manual job run to use scheduler dispatch with capacity management and retry (#8720)
RunPluginJobTypeAPI previously executed proposals with a naive sequential loop calling ExecutePluginJob per proposal. This had two bugs:

1. Double-lock: RunPluginJobTypeAPI held pluginLock while calling ExecutePluginJob, which tried to re-acquire the same lock for every job in the loop.
2. No capacity management: proposals were fired directly at workers without reserveScheduledExecutor, so every job beyond the worker concurrency limit received an immediate at_capacity error with no retry or backoff.

Fix: add Plugin.DispatchProposals, which reuses dispatchScheduledProposals - the same code path the scheduler loop uses - with executor reservation, configurable concurrency, and per-job retry with backoff. RunPluginJobTypeAPI now calls DispatchPluginProposals (a thin AdminServer wrapper) after acquiring pluginLock once.

Co-authored-by: Anton Ustyugov <anton@devops>
This commit is contained in:
@@ -1287,6 +1287,35 @@ func isWaitingTrackedJobState(state string) bool {
|
||||
return normalized == "pending" || normalized == "job_state_pending"
|
||||
}
|
||||
|
||||
// DispatchProposals dispatches a batch of proposals using the same capacity-aware
|
||||
// dispatch logic as the scheduler loop: concurrent execution, executor reservation
|
||||
// with backoff, and per-job retry on transient errors. The scheduler policy is
|
||||
// loaded from the persisted job type config; if the job type has no config or is
|
||||
// disabled a sensible default policy is used so manual runs always work.
|
||||
func (r *Plugin) DispatchProposals(
|
||||
ctx context.Context,
|
||||
jobType string,
|
||||
proposals []*plugin_pb.JobProposal,
|
||||
clusterContext *plugin_pb.ClusterContext,
|
||||
) (successCount, errorCount, canceledCount int) {
|
||||
if len(proposals) == 0 {
|
||||
return 0, 0, 0
|
||||
}
|
||||
|
||||
policy, enabled, err := r.loadSchedulerPolicy(jobType)
|
||||
if err != nil || !enabled {
|
||||
policy = schedulerPolicy{
|
||||
ExecutionConcurrency: defaultScheduledExecutionConcurrency,
|
||||
PerWorkerConcurrency: defaultScheduledPerWorkerConcurrency,
|
||||
ExecutionTimeout: defaultScheduledExecutionTimeout,
|
||||
RetryBackoff: defaultScheduledRetryBackoff,
|
||||
ExecutorReserveBackoff: 200 * time.Millisecond,
|
||||
}
|
||||
}
|
||||
|
||||
return r.dispatchScheduledProposals(ctx, jobType, proposals, clusterContext, policy)
|
||||
}
|
||||
|
||||
func (r *Plugin) filterScheduledProposals(proposals []*plugin_pb.JobProposal) []*plugin_pb.JobProposal {
|
||||
filtered := make([]*plugin_pb.JobProposal, 0, len(proposals))
|
||||
seenInRun := make(map[string]struct{}, len(proposals))
|
||||
|
||||
Reference in New Issue
Block a user