add admin script worker (#8491)

* admin: add plugin lock coordination

* shell: allow bypassing lock checks

* plugin worker: add admin script handler

* mini: include admin_script in plugin defaults

* admin script UI: drop name and enlarge text

* admin script: add default script

* admin_script: make run interval configurable

* plugin: gate other jobs during admin_script runs

* plugin: use last completed admin_script run

* admin: backfill plugin config defaults

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* comparable to default version

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* default to run

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* format

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* shell: respect pre-set noLock for fix.replication

* shell: add force no-lock mode for admin scripts

* volume balance worker already exists

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* admin: expose scheduler status JSON

* shell: add sleep command

* shell: restrict sleep syntax

* Revert "shell: respect pre-set noLock for fix.replication"

This reverts commit 2b14e8b82602a740d3a473c085e3b3a14f1ddbb3.

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* fix import

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less logs

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* Reduce master client logs on canceled contexts

* Update mini default job type count

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Chris Lu
2026-03-03 15:10:40 -08:00
committed by GitHub
parent 16f2269a33
commit e1e5b4a8a6
27 changed files with 1888 additions and 27 deletions

View File

@@ -98,6 +98,7 @@ type AdminServer struct {
// Maintenance system
maintenanceManager *maintenance.MaintenanceManager
plugin *adminplugin.Plugin
pluginLock *AdminLockManager
expireJobHandler func(jobID string, reason string) (*adminplugin.TrackedJob, bool, error)
// Topic retention purger
@@ -135,6 +136,8 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
ctx := context.Background()
go masterClient.KeepConnectedToMaster(ctx)
lockManager := NewAdminLockManager(masterClient, adminLockClientName)
server := &AdminServer{
masterClient: masterClient,
templateFS: templateFS,
@@ -146,6 +149,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
collectionStatsCacheThreshold: defaultStatsCacheTimeout,
s3TablesManager: newS3TablesManager(),
icebergPort: icebergPort,
pluginLock: lockManager,
}
// Initialize topic retention purger
@@ -229,6 +233,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
ClusterContextProvider: func(_ context.Context) (*plugin_pb.ClusterContext, error) {
return server.buildDefaultPluginClusterContext(), nil
},
LockManager: lockManager,
})
if err != nil && dataDir != "" {
glog.Warningf("Failed to initialize plugin with dataDir=%q: %v. Falling back to in-memory plugin state.", dataDir, err)
@@ -237,6 +242,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
ClusterContextProvider: func(_ context.Context) (*plugin_pb.ClusterContext, error) {
return server.buildDefaultPluginClusterContext(), nil
},
LockManager: lockManager,
})
}
if err != nil {
@@ -890,6 +896,13 @@ func (s *AdminServer) GetPlugin() *adminplugin.Plugin {
return s.plugin
}
func (s *AdminServer) acquirePluginLock(reason string) (func(), error) {
if s == nil || s.pluginLock == nil {
return func() {}, nil
}
return s.pluginLock.Acquire(reason)
}
// RequestPluginJobTypeDescriptor asks one worker for job type schema and returns the descriptor.
func (s *AdminServer) RequestPluginJobTypeDescriptor(ctx context.Context, jobType string, forceRefresh bool) (*plugin_pb.JobTypeDescriptor, error) {
if s.plugin == nil {
@@ -932,6 +945,13 @@ func (s *AdminServer) RunPluginDetection(
if s.plugin == nil {
return nil, fmt.Errorf("plugin is not enabled")
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detection %s", jobType))
if err != nil {
return nil, err
}
if releaseLock != nil {
defer releaseLock()
}
return s.plugin.RunDetection(ctx, jobType, clusterContext, maxResults)
}
@@ -957,6 +977,13 @@ func (s *AdminServer) RunPluginDetectionWithReport(
if s.plugin == nil {
return nil, fmt.Errorf("plugin is not enabled")
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detection %s", jobType))
if err != nil {
return nil, err
}
if releaseLock != nil {
defer releaseLock()
}
return s.plugin.RunDetectionWithReport(ctx, jobType, clusterContext, maxResults)
}
@@ -970,6 +997,17 @@ func (s *AdminServer) ExecutePluginJob(
if s.plugin == nil {
return nil, fmt.Errorf("plugin is not enabled")
}
jobType := ""
if job != nil {
jobType = strings.TrimSpace(job.JobType)
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin execution %s", jobType))
if err != nil {
return nil, err
}
if releaseLock != nil {
defer releaseLock()
}
return s.plugin.ExecuteJob(ctx, job, clusterContext, attempt)
}