add admin script worker (#8491)
* admin: add plugin lock coordination
* shell: allow bypassing lock checks
* plugin worker: add admin script handler
* mini: include admin_script in plugin defaults
* admin script UI: drop name and enlarge text
* admin script: add default script
* admin_script: make run interval configurable
* plugin: gate other jobs during admin_script runs
* plugin: use last completed admin_script run
* admin: backfill plugin config defaults
* templ
* comparable to default version
* default to run
* format
* shell: respect pre-set noLock for fix.replication
* shell: add force no-lock mode for admin scripts
* volume balance worker already exists
* admin: expose scheduler status JSON
* shell: add sleep command
* shell: restrict sleep syntax
* Revert "shell: respect pre-set noLock for fix.replication" (reverts commit 2b14e8b82602a740d3a473c085e3b3a14f1ddbb3)
* templ
* fix import
* less logs
* Reduce master client logs on canceled contexts
* Update mini default job type count

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
weed/admin/dash/admin_lock_manager.go | 148 lines (new file)
@@ -0,0 +1,148 @@
package dash

import (
    "sync"
    "time"

    "github.com/seaweedfs/seaweedfs/weed/wdclient"
    "github.com/seaweedfs/seaweedfs/weed/wdclient/exclusive_locks"
)

const (
    adminLockName       = "shell"
    adminLockClientName = "admin-plugin"
)

// AdminLockManager coordinates exclusive admin locks with reference counting.
// It is safe for concurrent use.
type AdminLockManager struct {
    locker     *exclusive_locks.ExclusiveLocker
    clientName string

    mu        sync.Mutex
    cond      *sync.Cond
    acquiring bool
    holdCount int

    lastAcquiredAt time.Time
    lastReleasedAt time.Time
    waitingSince   time.Time
    waitingReason  string
    currentReason  string
}

func NewAdminLockManager(masterClient *wdclient.MasterClient, clientName string) *AdminLockManager {
    if masterClient == nil {
        return nil
    }
    if clientName == "" {
        clientName = adminLockClientName
    }
    manager := &AdminLockManager{
        locker:     exclusive_locks.NewExclusiveLocker(masterClient, adminLockName),
        clientName: clientName,
    }
    manager.cond = sync.NewCond(&manager.mu)
    return manager
}

func (m *AdminLockManager) Acquire(reason string) (func(), error) {
    if m == nil || m.locker == nil {
        return func() {}, nil
    }

    m.mu.Lock()
    if reason != "" {
        m.locker.SetMessage(reason)
        m.currentReason = reason
    }
    for m.acquiring {
        m.cond.Wait()
    }
    if m.holdCount == 0 {
        m.acquiring = true
        m.waitingSince = time.Now().UTC()
        m.waitingReason = reason
        m.mu.Unlock()
        m.locker.RequestLock(m.clientName)
        m.mu.Lock()
        m.acquiring = false
        m.holdCount = 1
        m.lastAcquiredAt = time.Now().UTC()
        m.waitingSince = time.Time{}
        m.waitingReason = ""
        m.cond.Broadcast()
        m.mu.Unlock()
        return m.Release, nil
    }
    m.holdCount++
    if reason != "" {
        m.currentReason = reason
    }
    m.mu.Unlock()
    return m.Release, nil
}

func (m *AdminLockManager) Release() {
    if m == nil || m.locker == nil {
        return
    }

    m.mu.Lock()
    if m.holdCount <= 0 {
        m.mu.Unlock()
        return
    }
    m.holdCount--
    shouldRelease := m.holdCount == 0
    m.mu.Unlock()

    if shouldRelease {
        m.mu.Lock()
        m.lastReleasedAt = time.Now().UTC()
        m.currentReason = ""
        m.mu.Unlock()
        m.locker.ReleaseLock()
    }
}

type LockStatus struct {
    Held           bool       `json:"held"`
    HoldCount      int        `json:"hold_count"`
    Acquiring      bool       `json:"acquiring"`
    Message        string     `json:"message,omitempty"`
    WaitingReason  string     `json:"waiting_reason,omitempty"`
    LastAcquiredAt *time.Time `json:"last_acquired_at,omitempty"`
    LastReleasedAt *time.Time `json:"last_released_at,omitempty"`
    WaitingSince   *time.Time `json:"waiting_since,omitempty"`
}

func (m *AdminLockManager) Status() LockStatus {
    if m == nil {
        return LockStatus{}
    }

    m.mu.Lock()
    defer m.mu.Unlock()

    status := LockStatus{
        Held:          m.holdCount > 0,
        HoldCount:     m.holdCount,
        Acquiring:     m.acquiring,
        Message:       m.currentReason,
        WaitingReason: m.waitingReason,
    }
    if !m.lastAcquiredAt.IsZero() {
        at := m.lastAcquiredAt
        status.LastAcquiredAt = &at
    }
    if !m.lastReleasedAt.IsZero() {
        at := m.lastReleasedAt
        status.LastReleasedAt = &at
    }
    if !m.waitingSince.IsZero() {
        at := m.waitingSince
        status.WaitingSince = &at
    }
    return status
}
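Acquire is reference counted: the first caller takes the cluster-wide "shell" lock via the master, and later callers only bump holdCount, so concurrent plugin jobs inside one admin process share a single master lock. The returned closure is the matching Release, which frees the master lock only when the count drops back to zero. A minimal sketch of the calling pattern follows; acquirer, fakeLock, and runGatedJob are hypothetical stand-ins for a manager wired to a live master client:

package main

import "fmt"

// acquirer mirrors AdminLockManager.Acquire so the calling pattern can be
// shown without a live master connection (hypothetical interface).
type acquirer interface {
    Acquire(reason string) (func(), error)
}

// fakeLock is a hypothetical in-memory stand-in for AdminLockManager: it
// only counts holds, while the real manager takes the cluster-wide lock on
// the first hold and frees it on the last release.
type fakeLock struct{ holds int }

func (f *fakeLock) Acquire(reason string) (func(), error) {
    f.holds++
    fmt.Printf("acquired (%s), holds=%d\n", reason, f.holds)
    return func() {
        f.holds--
        fmt.Printf("released, holds=%d\n", f.holds)
    }, nil
}

func runGatedJob(lock acquirer, jobType string) error {
    release, err := lock.Acquire("plugin execution " + jobType)
    if err != nil {
        return err
    }
    defer release() // reference counted: the lock outlives nested holders
    // ... do the work that must not race with shell maintenance commands ...
    return nil
}

func main() {
    lock := &fakeLock{}
    if err := runGatedJob(lock, "admin_script"); err != nil {
        fmt.Println(err)
    }
}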
@@ -98,6 +98,7 @@ type AdminServer struct {
    // Maintenance system
    maintenanceManager *maintenance.MaintenanceManager
    plugin             *adminplugin.Plugin
    pluginLock         *AdminLockManager
    expireJobHandler   func(jobID string, reason string) (*adminplugin.TrackedJob, bool, error)

    // Topic retention purger
@@ -135,6 +136,8 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
    ctx := context.Background()
    go masterClient.KeepConnectedToMaster(ctx)

    lockManager := NewAdminLockManager(masterClient, adminLockClientName)

    server := &AdminServer{
        masterClient: masterClient,
        templateFS:   templateFS,
@@ -146,6 +149,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
        collectionStatsCacheThreshold: defaultStatsCacheTimeout,
        s3TablesManager:               newS3TablesManager(),
        icebergPort:                   icebergPort,
        pluginLock:                    lockManager,
    }

    // Initialize topic retention purger
@@ -229,6 +233,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
        ClusterContextProvider: func(_ context.Context) (*plugin_pb.ClusterContext, error) {
            return server.buildDefaultPluginClusterContext(), nil
        },
        LockManager: lockManager,
    })
    if err != nil && dataDir != "" {
        glog.Warningf("Failed to initialize plugin with dataDir=%q: %v. Falling back to in-memory plugin state.", dataDir, err)
@@ -237,6 +242,7 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
            ClusterContextProvider: func(_ context.Context) (*plugin_pb.ClusterContext, error) {
                return server.buildDefaultPluginClusterContext(), nil
            },
            LockManager: lockManager,
        })
    }
    if err != nil {
@@ -890,6 +896,13 @@ func (s *AdminServer) GetPlugin() *adminplugin.Plugin {
    return s.plugin
}

func (s *AdminServer) acquirePluginLock(reason string) (func(), error) {
    if s == nil || s.pluginLock == nil {
        return func() {}, nil
    }
    return s.pluginLock.Acquire(reason)
}

// RequestPluginJobTypeDescriptor asks one worker for job type schema and returns the descriptor.
func (s *AdminServer) RequestPluginJobTypeDescriptor(ctx context.Context, jobType string, forceRefresh bool) (*plugin_pb.JobTypeDescriptor, error) {
    if s.plugin == nil {
@@ -932,6 +945,13 @@ func (s *AdminServer) RunPluginDetection(
    if s.plugin == nil {
        return nil, fmt.Errorf("plugin is not enabled")
    }
    releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detection %s", jobType))
    if err != nil {
        return nil, err
    }
    if releaseLock != nil {
        defer releaseLock()
    }
    return s.plugin.RunDetection(ctx, jobType, clusterContext, maxResults)
}
@@ -957,6 +977,13 @@ func (s *AdminServer) RunPluginDetectionWithReport(
    if s.plugin == nil {
        return nil, fmt.Errorf("plugin is not enabled")
    }
    releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detection %s", jobType))
    if err != nil {
        return nil, err
    }
    if releaseLock != nil {
        defer releaseLock()
    }
    return s.plugin.RunDetectionWithReport(ctx, jobType, clusterContext, maxResults)
}
@@ -970,6 +997,17 @@ func (s *AdminServer) ExecutePluginJob(
    if s.plugin == nil {
        return nil, fmt.Errorf("plugin is not enabled")
    }
    jobType := ""
    if job != nil {
        jobType = strings.TrimSpace(job.JobType)
    }
    releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin execution %s", jobType))
    if err != nil {
        return nil, err
    }
    if releaseLock != nil {
        defer releaseLock()
    }
    return s.plugin.ExecuteJob(ctx, job, clusterContext, attempt)
}
@@ -214,6 +214,27 @@ func (s *AdminServer) GetPluginSchedulerStatesAPI(w http.ResponseWriter, r *http
    writeJSON(w, http.StatusOK, states)
}

// GetPluginSchedulerStatusAPI returns scheduler status including in-process jobs and lock state.
func (s *AdminServer) GetPluginSchedulerStatusAPI(w http.ResponseWriter, r *http.Request) {
    pluginSvc := s.GetPlugin()
    if pluginSvc == nil {
        writeJSON(w, http.StatusOK, map[string]interface{}{
            "enabled": false,
        })
        return
    }

    response := map[string]interface{}{
        "enabled":   true,
        "scheduler": pluginSvc.GetSchedulerStatus(),
    }
    if s.pluginLock != nil {
        response["lock"] = s.pluginLock.Status()
    }

    writeJSON(w, http.StatusOK, response)
}

// RequestPluginJobTypeSchemaAPI asks a worker for one job type schema.
func (s *AdminServer) RequestPluginJobTypeSchemaAPI(w http.ResponseWriter, r *http.Request) {
    jobType := strings.TrimSpace(mux.Vars(r)["jobType"])
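For reference, a response from GetPluginSchedulerStatusAPI above might look like the sketch below. The "lock" object follows the LockStatus JSON tags from admin_lock_manager.go; the "scheduler" payload is produced by GetSchedulerStatus, which is not shown in this diff, so its contents here are only an illustrative placeholder:

{
  "enabled": true,
  "scheduler": { "comment": "shape defined by GetSchedulerStatus, not shown in this diff" },
  "lock": {
    "held": true,
    "hold_count": 1,
    "acquiring": false,
    "message": "plugin execution admin_script",
    "last_acquired_at": "2025-01-01T00:00:00Z"
  }
}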
@@ -277,6 +298,9 @@ func (s *AdminServer) GetPluginJobTypeConfigAPI(w http.ResponseWriter, r *http.R
            AdminRuntime:       &plugin_pb.AdminRuntimeConfig{},
        }
    }
    if descriptor, err := s.LoadPluginJobTypeDescriptor(jobType); err == nil && descriptor != nil {
        applyDescriptorDefaultsToPersistedConfig(config, descriptor)
    }

    renderProtoJSON(w, http.StatusOK, config)
}
@@ -455,6 +479,14 @@ func (s *AdminServer) RunPluginJobTypeAPI(w http.ResponseWriter, r *http.Request
        writeJSONError(w, http.StatusBadRequest, "jobType is required")
        return
    }
    releaseLock, err := s.acquirePluginLock(fmt.Sprintf("plugin detect+execute %s", jobType))
    if err != nil {
        writeJSONError(w, http.StatusInternalServerError, err.Error())
        return
    }
    if releaseLock != nil {
        defer releaseLock()
    }

    var req struct {
        ClusterContext json.RawMessage `json:"cluster_context"`
@@ -771,6 +803,90 @@ func buildJobSpecFromProposal(jobType string, proposal *plugin_pb.JobProposal, i
    return jobSpec
}

func applyDescriptorDefaultsToPersistedConfig(
    config *plugin_pb.PersistedJobTypeConfig,
    descriptor *plugin_pb.JobTypeDescriptor,
) {
    if config == nil || descriptor == nil {
        return
    }

    if config.AdminConfigValues == nil {
        config.AdminConfigValues = map[string]*plugin_pb.ConfigValue{}
    }
    if config.WorkerConfigValues == nil {
        config.WorkerConfigValues = map[string]*plugin_pb.ConfigValue{}
    }
    if config.AdminRuntime == nil {
        config.AdminRuntime = &plugin_pb.AdminRuntimeConfig{}
    }

    if descriptor.AdminConfigForm != nil {
        for key, value := range descriptor.AdminConfigForm.DefaultValues {
            if value == nil {
                continue
            }
            current := config.AdminConfigValues[key]
            if current == nil {
                config.AdminConfigValues[key] = proto.Clone(value).(*plugin_pb.ConfigValue)
                continue
            }
            if strings.EqualFold(descriptor.JobType, "admin_script") &&
                key == "script" &&
                isBlankStringConfigValue(current) {
                config.AdminConfigValues[key] = proto.Clone(value).(*plugin_pb.ConfigValue)
            }
        }
    }
    if descriptor.WorkerConfigForm != nil {
        for key, value := range descriptor.WorkerConfigForm.DefaultValues {
            if value == nil {
                continue
            }
            if config.WorkerConfigValues[key] != nil {
                continue
            }
            config.WorkerConfigValues[key] = proto.Clone(value).(*plugin_pb.ConfigValue)
        }
    }
    if descriptor.AdminRuntimeDefaults != nil {
        runtime := config.AdminRuntime
        defaults := descriptor.AdminRuntimeDefaults
        if runtime.DetectionIntervalSeconds <= 0 {
            runtime.DetectionIntervalSeconds = defaults.DetectionIntervalSeconds
        }
        if runtime.DetectionTimeoutSeconds <= 0 {
            runtime.DetectionTimeoutSeconds = defaults.DetectionTimeoutSeconds
        }
        if runtime.MaxJobsPerDetection <= 0 {
            runtime.MaxJobsPerDetection = defaults.MaxJobsPerDetection
        }
        if runtime.GlobalExecutionConcurrency <= 0 {
            runtime.GlobalExecutionConcurrency = defaults.GlobalExecutionConcurrency
        }
        if runtime.PerWorkerExecutionConcurrency <= 0 {
            runtime.PerWorkerExecutionConcurrency = defaults.PerWorkerExecutionConcurrency
        }
        if runtime.RetryBackoffSeconds <= 0 {
            runtime.RetryBackoffSeconds = defaults.RetryBackoffSeconds
        }
        if runtime.RetryLimit < 0 {
            runtime.RetryLimit = defaults.RetryLimit
        }
    }
}

func isBlankStringConfigValue(value *plugin_pb.ConfigValue) bool {
    if value == nil {
        return true
    }
    kind, ok := value.Kind.(*plugin_pb.ConfigValue_StringValue)
    if !ok {
        return false
    }
    return strings.TrimSpace(kind.StringValue) == ""
}

func parsePositiveInt(raw string, defaultValue int) int {
    value, err := strconv.Atoi(strings.TrimSpace(raw))
    if err != nil || value <= 0 {
@@ -140,3 +140,83 @@ func TestBuildJobSpecFromProposalDoesNotReuseProposalID(t *testing.T) {
        t.Fatalf("dedupe key must be preserved: got=%s want=%s", jobA.DedupeKey, proposal.DedupeKey)
    }
}

func TestApplyDescriptorDefaultsToPersistedConfigBackfillsAdminDefaults(t *testing.T) {
    t.Parallel()

    config := &plugin_pb.PersistedJobTypeConfig{
        JobType:            "admin_script",
        AdminConfigValues:  map[string]*plugin_pb.ConfigValue{},
        WorkerConfigValues: map[string]*plugin_pb.ConfigValue{},
        AdminRuntime:       &plugin_pb.AdminRuntimeConfig{},
    }
    descriptor := &plugin_pb.JobTypeDescriptor{
        JobType: "admin_script",
        AdminConfigForm: &plugin_pb.ConfigForm{
            DefaultValues: map[string]*plugin_pb.ConfigValue{
                "script": {
                    Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "volume.balance -apply"},
                },
                "run_interval_minutes": {
                    Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 17},
                },
            },
        },
        AdminRuntimeDefaults: &plugin_pb.AdminRuntimeDefaults{
            DetectionIntervalSeconds: 60,
            DetectionTimeoutSeconds:  300,
        },
    }

    applyDescriptorDefaultsToPersistedConfig(config, descriptor)

    script := config.AdminConfigValues["script"]
    if script == nil {
        t.Fatalf("expected script default to be backfilled")
    }
    scriptKind, ok := script.Kind.(*plugin_pb.ConfigValue_StringValue)
    if !ok || scriptKind.StringValue == "" {
        t.Fatalf("expected non-empty script default, got=%+v", script)
    }
    if config.AdminRuntime.DetectionIntervalSeconds != 60 {
        t.Fatalf("expected runtime detection interval default to be backfilled")
    }
}

func TestApplyDescriptorDefaultsToPersistedConfigReplacesBlankAdminScript(t *testing.T) {
    t.Parallel()

    config := &plugin_pb.PersistedJobTypeConfig{
        JobType: "admin_script",
        AdminConfigValues: map[string]*plugin_pb.ConfigValue{
            "script": {
                Kind: &plugin_pb.ConfigValue_StringValue{StringValue: " "},
            },
        },
        AdminRuntime: &plugin_pb.AdminRuntimeConfig{},
    }
    descriptor := &plugin_pb.JobTypeDescriptor{
        JobType: "admin_script",
        AdminConfigForm: &plugin_pb.ConfigForm{
            DefaultValues: map[string]*plugin_pb.ConfigValue{
                "script": {
                    Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "volume.fix.replication -apply"},
                },
            },
        },
    }

    applyDescriptorDefaultsToPersistedConfig(config, descriptor)

    script := config.AdminConfigValues["script"]
    if script == nil {
        t.Fatalf("expected script config value")
    }
    scriptKind, ok := script.Kind.(*plugin_pb.ConfigValue_StringValue)
    if !ok {
        t.Fatalf("expected string script config value, got=%T", script.Kind)
    }
    if scriptKind.StringValue != "volume.fix.replication -apply" {
        t.Fatalf("expected blank script to be replaced by default, got=%q", scriptKind.StringValue)
    }
}
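These tests exercise applyDescriptorDefaultsToPersistedConfig, which lives in the same dash package as the handlers above. Assuming the standard repository layout (the package path is inferred from weed/admin/dash/admin_lock_manager.go in this commit), they can be run in isolation with:

go test ./weed/admin/dash -run TestApplyDescriptorDefaults -v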