add admin script worker (#8491)

* admin: add plugin lock coordination

* shell: allow bypassing lock checks

* plugin worker: add admin script handler

* mini: include admin_script in plugin defaults

* admin script UI: drop name and enlarge text

* admin script: add default script

* admin_script: make run interval configurable

* plugin: gate other jobs during admin_script runs

* plugin: use last completed admin_script run

* admin: backfill plugin config defaults

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* comparable to default version

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* default to run

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* format

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* shell: respect pre-set noLock for fix.replication

* shell: add force no-lock mode for admin scripts

* volume balance worker already exists

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* admin: expose scheduler status JSON

* shell: add sleep command

* shell: restrict sleep syntax

* Revert "shell: respect pre-set noLock for fix.replication"

This reverts commit 2b14e8b82602a740d3a473c085e3b3a14f1ddbb3.

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* fix import

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less logs

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* Reduce master client logs on canceled contexts

* Update mini default job type count

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Chris Lu
2026-03-03 15:10:40 -08:00
committed by GitHub
parent 16f2269a33
commit e1e5b4a8a6
27 changed files with 1888 additions and 27 deletions

View File

@@ -24,6 +24,7 @@ const (
defaultHeartbeatInterval = 30
defaultReconnectDelay = 5
defaultPendingSchemaBuffer = 1
adminScriptJobType = "admin_script"
)
type Options struct {
@@ -32,6 +33,7 @@ type Options struct {
SendTimeout time.Duration
SchedulerTick time.Duration
ClusterContextProvider func(context.Context) (*plugin_pb.ClusterContext, error)
LockManager LockManager
}
// JobTypeInfo contains metadata about a plugin job type.
@@ -52,6 +54,7 @@ type Plugin struct {
schedulerTick time.Duration
clusterContextProvider func(context.Context) (*plugin_pb.ClusterContext, error)
lockManager LockManager
schedulerMu sync.Mutex
nextDetectionAt map[string]time.Time
@@ -62,6 +65,9 @@ type Plugin struct {
schedulerExecMu sync.Mutex
schedulerExecReservations map[string]int
adminScriptRunMu sync.RWMutex
schedulerDetectionMu sync.Mutex
schedulerDetection map[string]*schedulerDetectionInfo
dedupeMu sync.Mutex
recentDedupeByType map[string]map[string]time.Time
@@ -148,6 +154,7 @@ func New(options Options) (*Plugin, error) {
sendTimeout: sendTimeout,
schedulerTick: schedulerTick,
clusterContextProvider: options.ClusterContextProvider,
lockManager: options.LockManager,
sessions: make(map[string]*streamSession),
pendingSchema: make(map[string]chan *plugin_pb.ConfigSchemaResponse),
pendingDetection: make(map[string]*pendingDetectionState),
@@ -156,6 +163,7 @@ func New(options Options) (*Plugin, error) {
detectionInFlight: make(map[string]bool),
detectorLeases: make(map[string]string),
schedulerExecReservations: make(map[string]int),
schedulerDetection: make(map[string]*schedulerDetectionInfo),
recentDedupeByType: make(map[string]map[string]time.Time),
jobs: make(map[string]*TrackedJob),
activities: make([]JobActivity, 0, 256),
@@ -382,6 +390,13 @@ func (r *Plugin) BaseDir() string {
return r.store.BaseDir()
}
func (r *Plugin) acquireAdminLock(reason string) (func(), error) {
if r == nil || r.lockManager == nil {
return func() {}, nil
}
return r.lockManager.Acquire(reason)
}
// RunDetectionWithReport requests one detector worker and returns proposals with request metadata.
func (r *Plugin) RunDetectionWithReport(
ctx context.Context,
@@ -389,6 +404,9 @@ func (r *Plugin) RunDetectionWithReport(
clusterContext *plugin_pb.ClusterContext,
maxResults int32,
) (*DetectionReport, error) {
releaseGate := r.acquireDetectionExecutionGate(jobType, false)
defer releaseGate()
detector, err := r.pickDetector(jobType)
if err != nil {
return nil, err
@@ -403,7 +421,10 @@ func (r *Plugin) RunDetectionWithReport(
if err != nil {
return nil, err
}
lastSuccessfulRun := r.loadLastSuccessfulRun(jobType)
lastCompletedRun := r.loadLastSuccessfulRun(jobType)
if strings.EqualFold(strings.TrimSpace(jobType), adminScriptJobType) {
lastCompletedRun = r.loadLastCompletedRun(jobType)
}
state := &pendingDetectionState{
complete: make(chan *plugin_pb.DetectionComplete, 1),
@@ -444,7 +465,7 @@ func (r *Plugin) RunDetectionWithReport(
AdminConfigValues: adminConfigValues,
WorkerConfigValues: workerConfigValues,
ClusterContext: clusterContext,
LastSuccessfulRun: lastSuccessfulRun,
LastSuccessfulRun: lastCompletedRun,
MaxResults: maxResults,
},
},
@@ -531,11 +552,14 @@ func (r *Plugin) ExecuteJob(
if job == nil {
return nil, fmt.Errorf("job is nil")
}
if strings.TrimSpace(job.JobType) == "" {
jobType := strings.TrimSpace(job.JobType)
if jobType == "" {
return nil, fmt.Errorf("job_type is required")
}
releaseGate := r.acquireDetectionExecutionGate(jobType, true)
defer releaseGate()
executor, err := r.registry.PickExecutor(job.JobType)
executor, err := r.registry.PickExecutor(jobType)
if err != nil {
return nil, err
}
@@ -543,6 +567,23 @@ func (r *Plugin) ExecuteJob(
return r.executeJobWithExecutor(ctx, executor, job, clusterContext, attempt)
}
func (r *Plugin) acquireDetectionExecutionGate(jobType string, execution bool) func() {
normalizedJobType := strings.ToLower(strings.TrimSpace(jobType))
if execution && normalizedJobType == adminScriptJobType {
r.adminScriptRunMu.Lock()
return func() {
r.adminScriptRunMu.Unlock()
}
}
if normalizedJobType != adminScriptJobType {
r.adminScriptRunMu.RLock()
return func() {
r.adminScriptRunMu.RUnlock()
}
}
return func() {}
}
func (r *Plugin) executeJobWithExecutor(
ctx context.Context,
executor *WorkerSession,
@@ -1291,6 +1332,41 @@ func (r *Plugin) loadLastSuccessfulRun(jobType string) *timestamppb.Timestamp {
return timestamppb.New(latest.UTC())
}
func (r *Plugin) loadLastCompletedRun(jobType string) *timestamppb.Timestamp {
history, err := r.store.LoadRunHistory(jobType)
if err != nil {
glog.Warningf("Plugin failed to load run history for %s: %v", jobType, err)
return nil
}
if history == nil {
return nil
}
var latest time.Time
for i := range history.SuccessfulRuns {
completedAt := history.SuccessfulRuns[i].CompletedAt
if completedAt == nil || completedAt.IsZero() {
continue
}
if latest.IsZero() || completedAt.After(latest) {
latest = *completedAt
}
}
for i := range history.ErrorRuns {
completedAt := history.ErrorRuns[i].CompletedAt
if completedAt == nil || completedAt.IsZero() {
continue
}
if latest.IsZero() || completedAt.After(latest) {
latest = *completedAt
}
}
if latest.IsZero() {
return nil
}
return timestamppb.New(latest.UTC())
}
func CloneConfigValueMap(in map[string]*plugin_pb.ConfigValue) map[string]*plugin_pb.ConfigValue {
if len(in) == 0 {
return map[string]*plugin_pb.ConfigValue{}