add admin script worker (#8491)

* admin: add plugin lock coordination

* shell: allow bypassing lock checks

* plugin worker: add admin script handler

* mini: include admin_script in plugin defaults

* admin script UI: drop name and enlarge text

* admin script: add default script

* admin_script: make run interval configurable

* plugin: gate other jobs during admin_script runs

* plugin: use last completed admin_script run

* admin: backfill plugin config defaults

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* comparable to default version

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* default to run

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* format

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* shell: respect pre-set noLock for fix.replication

* shell: add force no-lock mode for admin scripts

* volume balance worker already exists

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* admin: expose scheduler status JSON

* shell: add sleep command

* shell: restrict sleep syntax

* Revert "shell: respect pre-set noLock for fix.replication"

This reverts commit 2b14e8b82602a740d3a473c085e3b3a14f1ddbb3.

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* fix import

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less logs

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* Reduce master client logs on canceled contexts

* Update mini default job type count

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Chris Lu
2026-03-03 15:10:40 -08:00
committed by GitHub
parent 16f2269a33
commit e1e5b4a8a6
27 changed files with 1888 additions and 27 deletions

View File

@@ -0,0 +1,148 @@
package dash
import (
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
"github.com/seaweedfs/seaweedfs/weed/wdclient/exclusive_locks"
)
const (
	// adminLockName is the distributed lock name. It is "shell" — presumably
	// shared with the weed shell's exclusive lock so admin jobs and shell
	// commands cannot run maintenance concurrently; confirm against shell code.
	adminLockName = "shell"
	// adminLockClientName is the default client identity used when the caller
	// does not supply one (see NewAdminLockManager).
	adminLockClientName = "admin-plugin"
)
// AdminLockManager coordinates exclusive admin locks with reference counting.
// It is safe for concurrent use.
type AdminLockManager struct {
	// locker performs the actual distributed lock RPCs against the master.
	locker *exclusive_locks.ExclusiveLocker
	// clientName identifies this process when requesting the lock.
	clientName string

	// mu guards all fields below; cond is signaled when an in-flight
	// acquisition completes.
	mu   sync.Mutex
	cond *sync.Cond
	// acquiring is true while one goroutine is blocked in RequestLock.
	acquiring bool
	// holdCount is the number of outstanding Acquire calls sharing the lock.
	holdCount int
	// lastAcquiredAt / lastReleasedAt record the most recent lock
	// transitions (UTC); zero until the first transition.
	lastAcquiredAt time.Time
	lastReleasedAt time.Time
	// waitingSince / waitingReason describe the in-flight acquisition, if any.
	waitingSince  time.Time
	waitingReason string
	// currentReason is the reason supplied by the most recent Acquire caller.
	currentReason string
}
// NewAdminLockManager builds a lock manager backed by the given master client.
// A nil master client yields a nil manager (all methods tolerate a nil
// receiver). An empty clientName falls back to adminLockClientName.
func NewAdminLockManager(masterClient *wdclient.MasterClient, clientName string) *AdminLockManager {
	if masterClient == nil {
		return nil
	}
	name := clientName
	if name == "" {
		name = adminLockClientName
	}
	m := &AdminLockManager{
		locker:     exclusive_locks.NewExclusiveLocker(masterClient, adminLockName),
		clientName: name,
	}
	m.cond = sync.NewCond(&m.mu)
	return m
}
// Acquire takes one reference on the shared admin lock, blocking until the
// lock is held, and returns a release function (m.Release) that drops that
// reference. Concurrent callers are coalesced: only the first caller performs
// the distributed lock request; later callers either wait on cond for that
// request to finish or, if the lock is already held, just bump holdCount.
// A nil manager is a no-op and returns a no-op release function.
// NOTE(review): the returned error is always nil here — RequestLock is
// assumed to block until the lock is granted; confirm in exclusive_locks.
func (m *AdminLockManager) Acquire(reason string) (func(), error) {
	if m == nil || m.locker == nil {
		return func() {}, nil
	}
	m.mu.Lock()
	if reason != "" {
		// The latest caller's reason wins as the lock message/state.
		m.locker.SetMessage(reason)
		m.currentReason = reason
	}
	// Wait out any in-flight acquisition so we observe a settled holdCount.
	for m.acquiring {
		m.cond.Wait()
	}
	if m.holdCount == 0 {
		// We are the first holder: perform the distributed lock request
		// outside the mutex so Status()/other callers are not blocked.
		m.acquiring = true
		m.waitingSince = time.Now().UTC()
		m.waitingReason = reason
		m.mu.Unlock()
		m.locker.RequestLock(m.clientName)
		m.mu.Lock()
		m.acquiring = false
		m.holdCount = 1
		m.lastAcquiredAt = time.Now().UTC()
		m.waitingSince = time.Time{}
		m.waitingReason = ""
		// Wake all goroutines parked in the cond.Wait loop above.
		m.cond.Broadcast()
		m.mu.Unlock()
		return m.Release, nil
	}
	// Lock already held by this process: share it via the reference count.
	m.holdCount++
	if reason != "" {
		m.currentReason = reason
	}
	m.mu.Unlock()
	return m.Release, nil
}
// Release drops one reference on the shared admin lock. When the last
// reference is dropped, the release timestamp and reason are cleared and the
// distributed lock is released. Extra calls (holdCount already zero) and a
// nil manager are no-ops.
//
// The hold-count decrement and the bookkeeping update happen in a single
// critical section: the original implementation unlocked between the two,
// leaving a window in which Status() could observe holdCount == 0 with a
// stale currentReason and lastReleasedAt. ReleaseLock itself is still called
// outside the mutex so a slow RPC does not block Acquire/Status.
func (m *AdminLockManager) Release() {
	if m == nil || m.locker == nil {
		return
	}
	m.mu.Lock()
	if m.holdCount <= 0 {
		// Unbalanced Release; nothing to do.
		m.mu.Unlock()
		return
	}
	m.holdCount--
	shouldRelease := m.holdCount == 0
	if shouldRelease {
		m.lastReleasedAt = time.Now().UTC()
		m.currentReason = ""
	}
	m.mu.Unlock()
	if shouldRelease {
		m.locker.ReleaseLock()
	}
}
// LockStatus is a JSON-serializable snapshot of the admin lock state, as
// returned by AdminLockManager.Status and exposed on the scheduler status API.
type LockStatus struct {
	// Held is true while at least one Acquire reference is outstanding.
	Held bool `json:"held"`
	// HoldCount is the number of outstanding Acquire references.
	HoldCount int `json:"hold_count"`
	// Acquiring is true while a distributed lock request is in flight.
	Acquiring bool `json:"acquiring"`
	// Message is the reason supplied by the most recent Acquire caller.
	Message string `json:"message,omitempty"`
	// WaitingReason is the reason attached to the in-flight acquisition.
	WaitingReason string `json:"waiting_reason,omitempty"`
	// Timestamps are UTC and omitted while their underlying value is zero.
	LastAcquiredAt *time.Time `json:"last_acquired_at,omitempty"`
	LastReleasedAt *time.Time `json:"last_released_at,omitempty"`
	WaitingSince   *time.Time `json:"waiting_since,omitempty"`
}
// Status returns a consistent snapshot of the lock state. Zero timestamps are
// reported as nil pointers so they are omitted from JSON output. A nil
// manager yields the zero LockStatus.
func (m *AdminLockManager) Status() LockStatus {
	if m == nil {
		return LockStatus{}
	}
	m.mu.Lock()
	defer m.mu.Unlock()

	// asPtr copies t so the returned pointer does not alias mutable fields.
	asPtr := func(t time.Time) *time.Time {
		if t.IsZero() {
			return nil
		}
		copied := t
		return &copied
	}

	return LockStatus{
		Held:           m.holdCount > 0,
		HoldCount:      m.holdCount,
		Acquiring:      m.acquiring,
		Message:        m.currentReason,
		WaitingReason:  m.waitingReason,
		LastAcquiredAt: asPtr(m.lastAcquiredAt),
		LastReleasedAt: asPtr(m.lastReleasedAt),
		WaitingSince:   asPtr(m.waitingSince),
	}
}

View File

@@ -98,6 +98,7 @@ type AdminServer struct {
// Maintenance system
maintenanceManager *maintenance.MaintenanceManager
plugin *adminplugin.Plugin
pluginLock *AdminLockManager
expireJobHandler func(jobID string, reason string) (*adminplugin.TrackedJob, bool, error)
// Topic retention purger
@@ -135,6 +136,8 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
ctx := context.Background()
go masterClient.KeepConnectedToMaster(ctx)
lockManager := NewAdminLockManager(masterClient, adminLockClientName)
server := &AdminServer{
masterClient: masterClient,
templateFS: templateFS,
@@ -146,6 +149,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
collectionStatsCacheThreshold: defaultStatsCacheTimeout,
s3TablesManager: newS3TablesManager(),
icebergPort: icebergPort,
pluginLock: lockManager,
}
// Initialize topic retention purger
@@ -229,6 +233,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
ClusterContextProvider: func(_ context.Context) (*plugin_pb.ClusterContext, error) {
return server.buildDefaultPluginClusterContext(), nil
},
LockManager: lockManager,
})
if err != nil && dataDir != "" {
glog.Warningf("Failed to initialize plugin with dataDir=%q: %v. Falling back to in-memory plugin state.", dataDir, err)
@@ -237,6 +242,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
ClusterContextProvider: func(_ context.Context) (*plugin_pb.ClusterContext, error) {
return server.buildDefaultPluginClusterContext(), nil
},
LockManager: lockManager,
})
}
if err != nil {
@@ -890,6 +896,13 @@ func (s *AdminServer) GetPlugin() *adminplugin.Plugin {
return s.plugin
}
// acquirePluginLock takes the shared plugin lock with the given reason.
// When the server or its lock manager is nil it returns a no-op release
// function and no error, so callers can always defer the release.
func (s *AdminServer) acquirePluginLock(reason string) (func(), error) {
	if s != nil && s.pluginLock != nil {
		return s.pluginLock.Acquire(reason)
	}
	return func() {}, nil
}
// RequestPluginJobTypeDescriptor asks one worker for job type schema and returns the descriptor.
func (s *AdminServer) RequestPluginJobTypeDescriptor(ctx context.Context, jobType string, forceRefresh bool) (*plugin_pb.JobTypeDescriptor, error) {
if s.plugin == nil {
@@ -932,6 +945,13 @@ func (s *AdminServer) RunPluginDetection(
if s.plugin == nil {
return nil, fmt.Errorf("plugin is not enabled")
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detection %s", jobType))
if err != nil {
return nil, err
}
if releaseLock != nil {
defer releaseLock()
}
return s.plugin.RunDetection(ctx, jobType, clusterContext, maxResults)
}
@@ -957,6 +977,13 @@ func (s *AdminServer) RunPluginDetectionWithReport(
if s.plugin == nil {
return nil, fmt.Errorf("plugin is not enabled")
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detection %s", jobType))
if err != nil {
return nil, err
}
if releaseLock != nil {
defer releaseLock()
}
return s.plugin.RunDetectionWithReport(ctx, jobType, clusterContext, maxResults)
}
@@ -970,6 +997,17 @@ func (s *AdminServer) ExecutePluginJob(
if s.plugin == nil {
return nil, fmt.Errorf("plugin is not enabled")
}
jobType := ""
if job != nil {
jobType = strings.TrimSpace(job.JobType)
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin execution %s", jobType))
if err != nil {
return nil, err
}
if releaseLock != nil {
defer releaseLock()
}
return s.plugin.ExecuteJob(ctx, job, clusterContext, attempt)
}

View File

@@ -214,6 +214,27 @@ func (s *AdminServer) GetPluginSchedulerStatesAPI(w http.ResponseWriter, r *http
writeJSON(w, http.StatusOK, states)
}
// GetPluginSchedulerStatusAPI returns scheduler status including in-process
// jobs and lock state. When the plugin is disabled it responds with
// {"enabled": false}; otherwise the response carries the scheduler status
// and, when a lock manager exists, the current lock snapshot.
func (s *AdminServer) GetPluginSchedulerStatusAPI(w http.ResponseWriter, r *http.Request) {
	svc := s.GetPlugin()
	if svc == nil {
		writeJSON(w, http.StatusOK, map[string]interface{}{"enabled": false})
		return
	}

	payload := map[string]interface{}{
		"enabled":   true,
		"scheduler": svc.GetSchedulerStatus(),
	}
	if lock := s.pluginLock; lock != nil {
		payload["lock"] = lock.Status()
	}
	writeJSON(w, http.StatusOK, payload)
}
// RequestPluginJobTypeSchemaAPI asks a worker for one job type schema.
func (s *AdminServer) RequestPluginJobTypeSchemaAPI(w http.ResponseWriter, r *http.Request) {
jobType := strings.TrimSpace(mux.Vars(r)["jobType"])
@@ -277,6 +298,9 @@ func (s *AdminServer) GetPluginJobTypeConfigAPI(w http.ResponseWriter, r *http.R
AdminRuntime: &plugin_pb.AdminRuntimeConfig{},
}
}
if descriptor, err := s.LoadPluginJobTypeDescriptor(jobType); err == nil && descriptor != nil {
applyDescriptorDefaultsToPersistedConfig(config, descriptor)
}
renderProtoJSON(w, http.StatusOK, config)
}
@@ -455,6 +479,14 @@ func (s *AdminServer) RunPluginJobTypeAPI(w http.ResponseWriter, r *http.Request
writeJSONError(w, http.StatusBadRequest, "jobType is required")
return
}
releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detect+execute %s", jobType))
if err != nil {
writeJSONError(w, http.StatusInternalServerError, err.Error())
return
}
if releaseLock != nil {
defer releaseLock()
}
var req struct {
ClusterContext json.RawMessage `json:"cluster_context"`
@@ -771,6 +803,90 @@ func buildJobSpecFromProposal(jobType string, proposal *plugin_pb.JobProposal, i
return jobSpec
}
// applyDescriptorDefaultsToPersistedConfig backfills a persisted job-type
// config with defaults from the worker-supplied descriptor, in place:
//   - missing admin/worker config values are cloned from the descriptor's
//     default values (existing values are kept);
//   - as a special case, a blank "script" value for the admin_script job type
//     is replaced by the default script;
//   - unset (non-positive) admin runtime fields take the descriptor defaults.
//
// Either argument being nil makes this a no-op.
func applyDescriptorDefaultsToPersistedConfig(
	config *plugin_pb.PersistedJobTypeConfig,
	descriptor *plugin_pb.JobTypeDescriptor,
) {
	if config == nil || descriptor == nil {
		return
	}
	// Ensure maps/structs exist so the backfill loops below can write into them.
	if config.AdminConfigValues == nil {
		config.AdminConfigValues = map[string]*plugin_pb.ConfigValue{}
	}
	if config.WorkerConfigValues == nil {
		config.WorkerConfigValues = map[string]*plugin_pb.ConfigValue{}
	}
	if config.AdminRuntime == nil {
		config.AdminRuntime = &plugin_pb.AdminRuntimeConfig{}
	}
	if descriptor.AdminConfigForm != nil {
		for key, value := range descriptor.AdminConfigForm.DefaultValues {
			if value == nil {
				continue
			}
			current := config.AdminConfigValues[key]
			if current == nil {
				// Clone so later mutation of the config never touches the descriptor.
				config.AdminConfigValues[key] = proto.Clone(value).(*plugin_pb.ConfigValue)
				continue
			}
			// admin_script only: a whitespace-only script is treated as unset
			// and replaced by the default script.
			if strings.EqualFold(descriptor.JobType, "admin_script") &&
				key == "script" &&
				isBlankStringConfigValue(current) {
				config.AdminConfigValues[key] = proto.Clone(value).(*plugin_pb.ConfigValue)
			}
		}
	}
	if descriptor.WorkerConfigForm != nil {
		for key, value := range descriptor.WorkerConfigForm.DefaultValues {
			if value == nil {
				continue
			}
			// Worker values have no blank-replacement special case: any
			// existing value wins.
			if config.WorkerConfigValues[key] != nil {
				continue
			}
			config.WorkerConfigValues[key] = proto.Clone(value).(*plugin_pb.ConfigValue)
		}
	}
	if descriptor.AdminRuntimeDefaults != nil {
		runtime := config.AdminRuntime
		defaults := descriptor.AdminRuntimeDefaults
		// Non-positive values are treated as "unset" for these fields.
		if runtime.DetectionIntervalSeconds <= 0 {
			runtime.DetectionIntervalSeconds = defaults.DetectionIntervalSeconds
		}
		if runtime.DetectionTimeoutSeconds <= 0 {
			runtime.DetectionTimeoutSeconds = defaults.DetectionTimeoutSeconds
		}
		if runtime.MaxJobsPerDetection <= 0 {
			runtime.MaxJobsPerDetection = defaults.MaxJobsPerDetection
		}
		if runtime.GlobalExecutionConcurrency <= 0 {
			runtime.GlobalExecutionConcurrency = defaults.GlobalExecutionConcurrency
		}
		if runtime.PerWorkerExecutionConcurrency <= 0 {
			runtime.PerWorkerExecutionConcurrency = defaults.PerWorkerExecutionConcurrency
		}
		if runtime.RetryBackoffSeconds <= 0 {
			runtime.RetryBackoffSeconds = defaults.RetryBackoffSeconds
		}
		// RetryLimit deliberately only backfills when negative: an explicit
		// zero (never retry) is preserved — presumably intentional; confirm.
		if runtime.RetryLimit < 0 {
			runtime.RetryLimit = defaults.RetryLimit
		}
	}
}
// isBlankStringConfigValue reports whether value is nil or a string config
// value containing only whitespace. Non-string kinds are never blank.
func isBlankStringConfigValue(value *plugin_pb.ConfigValue) bool {
	if value == nil {
		return true
	}
	switch kind := value.Kind.(type) {
	case *plugin_pb.ConfigValue_StringValue:
		return strings.TrimSpace(kind.StringValue) == ""
	default:
		return false
	}
}
func parsePositiveInt(raw string, defaultValue int) int {
value, err := strconv.Atoi(strings.TrimSpace(raw))
if err != nil || value <= 0 {

View File

@@ -140,3 +140,83 @@ func TestBuildJobSpecFromProposalDoesNotReuseProposalID(t *testing.T) {
t.Fatalf("dedupe key must be preserved: got=%s want=%s", jobA.DedupeKey, proposal.DedupeKey)
}
}
// TestApplyDescriptorDefaultsToPersistedConfigBackfillsAdminDefaults verifies
// that missing admin config values and unset runtime fields are backfilled
// from the descriptor defaults into an otherwise empty persisted config.
func TestApplyDescriptorDefaultsToPersistedConfigBackfillsAdminDefaults(t *testing.T) {
	t.Parallel()
	// Empty config: everything should come from the descriptor defaults.
	config := &plugin_pb.PersistedJobTypeConfig{
		JobType:            "admin_script",
		AdminConfigValues:  map[string]*plugin_pb.ConfigValue{},
		WorkerConfigValues: map[string]*plugin_pb.ConfigValue{},
		AdminRuntime:       &plugin_pb.AdminRuntimeConfig{},
	}
	descriptor := &plugin_pb.JobTypeDescriptor{
		JobType: "admin_script",
		AdminConfigForm: &plugin_pb.ConfigForm{
			DefaultValues: map[string]*plugin_pb.ConfigValue{
				"script": {
					Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "volume.balance -apply"},
				},
				"run_interval_minutes": {
					Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 17},
				},
			},
		},
		AdminRuntimeDefaults: &plugin_pb.AdminRuntimeDefaults{
			DetectionIntervalSeconds: 60,
			DetectionTimeoutSeconds:  300,
		},
	}
	applyDescriptorDefaultsToPersistedConfig(config, descriptor)
	// The script default must be cloned into the config as a non-empty string.
	script := config.AdminConfigValues["script"]
	if script == nil {
		t.Fatalf("expected script default to be backfilled")
	}
	scriptKind, ok := script.Kind.(*plugin_pb.ConfigValue_StringValue)
	if !ok || scriptKind.StringValue == "" {
		t.Fatalf("expected non-empty script default, got=%+v", script)
	}
	// Runtime field was zero (unset), so the descriptor default applies.
	if config.AdminRuntime.DetectionIntervalSeconds != 60 {
		t.Fatalf("expected runtime detection interval default to be backfilled")
	}
}
// TestApplyDescriptorDefaultsToPersistedConfigReplacesBlankAdminScript
// verifies the admin_script special case: an existing but whitespace-only
// "script" value is replaced by the descriptor's default script.
func TestApplyDescriptorDefaultsToPersistedConfigReplacesBlankAdminScript(t *testing.T) {
	t.Parallel()
	// Persisted config holds a blank (whitespace-only) script value.
	config := &plugin_pb.PersistedJobTypeConfig{
		JobType: "admin_script",
		AdminConfigValues: map[string]*plugin_pb.ConfigValue{
			"script": {
				Kind: &plugin_pb.ConfigValue_StringValue{StringValue: " "},
			},
		},
		AdminRuntime: &plugin_pb.AdminRuntimeConfig{},
	}
	descriptor := &plugin_pb.JobTypeDescriptor{
		JobType: "admin_script",
		AdminConfigForm: &plugin_pb.ConfigForm{
			DefaultValues: map[string]*plugin_pb.ConfigValue{
				"script": {
					Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "volume.fix.replication -apply"},
				},
			},
		},
	}
	applyDescriptorDefaultsToPersistedConfig(config, descriptor)
	script := config.AdminConfigValues["script"]
	if script == nil {
		t.Fatalf("expected script config value")
	}
	scriptKind, ok := script.Kind.(*plugin_pb.ConfigValue_StringValue)
	if !ok {
		t.Fatalf("expected string script config value, got=%T", script.Kind)
	}
	// The blank value must have been overwritten by the default.
	if scriptKind.StringValue != "volume.fix.replication -apply" {
		t.Fatalf("expected blank script to be replaced by default, got=%q", scriptKind.StringValue)
	}
}