add admin script worker (#8491)

* admin: add plugin lock coordination

* shell: allow bypassing lock checks

* plugin worker: add admin script handler

* mini: include admin_script in plugin defaults

* admin script UI: drop name and enlarge text

* admin script: add default script

* admin_script: make run interval configurable

* plugin: gate other jobs during admin_script runs (a gating sketch follows the commit list below)

* plugin: use last completed admin_script run

* admin: backfill plugin config defaults

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* comparable to default version

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* default to run

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* format

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* shell: respect pre-set noLock for fix.replication

* shell: add force no-lock mode for admin scripts

* volume balance worker already exists

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* admin: expose scheduler status JSON

* shell: add sleep command

* shell: restrict sleep syntax (a sleep sketch follows the commit metadata below)

* Revert "shell: respect pre-set noLock for fix.replication"

This reverts commit 2b14e8b82602a740d3a473c085e3b3a14f1ddbb3.

* templ

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* fix import

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less logs

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* Reduce master client logs on canceled contexts

* Update mini default job type count

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
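
The `plugin: gate other jobs during admin_script runs` change above pauses other plugin work while an admin script executes. A minimal sketch of that kind of gate, with hypothetical names (`adminScriptRunning`, `canSchedule`) that are illustrative only, not the actual plugin worker code:

```go
package plugin

import "sync/atomic"

// adminScriptRunning is a hypothetical flag the admin script handler would
// set around each run (illustrative only; not the actual SeaweedFS code).
var adminScriptRunning atomic.Bool

// canSchedule holds back every other job type while an admin_script run is
// in flight, mirroring the gating described in the commit above.
func canSchedule(jobType string) bool {
	if jobType != "admin_script" && adminScriptRunning.Load() {
		return false // hold other jobs until the admin script finishes
	}
	return true
}
```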
Author: Chris Lu
Date: 2026-03-03 15:10:40 -08:00 (committed by GitHub)
Commit: e1e5b4a8a6 (parent 16f2269a33)
27 changed files with 1888 additions and 27 deletions

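The two shell commits above (`shell: add sleep command`, `shell: restrict sleep syntax`) introduce a sleep with a deliberately narrow argument form. A minimal sketch of such a restricted command, assuming a single Go-style duration argument; the function name and validation rules are illustrative, not the actual shell code:

```go
package shell

import (
	"fmt"
	"time"
)

// doSleep accepts exactly one positive duration such as "30s" or "2m"
// (hypothetical sketch of the restricted syntax).
func doSleep(args []string) error {
	if len(args) != 1 {
		return fmt.Errorf("usage: sleep <duration>, e.g. sleep 30s")
	}
	d, err := time.ParseDuration(args[0])
	if err != nil || d <= 0 {
		return fmt.Errorf("invalid duration %q", args[0])
	}
	time.Sleep(d)
	return nil
}
```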

@@ -11,6 +11,8 @@ import (
 	"time"

 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"

 	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb"
@@ -26,6 +28,22 @@ type masterVolumeProvider struct {
 	masterClient *MasterClient
 }

+func isCanceledErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return true
+	}
+	if statusErr, ok := status.FromError(err); ok {
+		switch statusErr.Code() {
+		case codes.Canceled, codes.DeadlineExceeded:
+			return true
+		}
+	}
+	return false
+}
+
 // LookupVolumeIds queries the master for volume locations (fallback when cache misses)
 // Returns partial results with aggregated errors for volumes that failed
 func (p *masterVolumeProvider) LookupVolumeIds(ctx context.Context, volumeIds []string) (map[string][]Location, error) {
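
For reference, this is what the new helper treats as an expected shutdown versus a real failure (an illustrative snippet, not part of the diff, assuming `isCanceledErr` from the hunk above is in scope):

```go
package wdclient

import (
	"context"
	"fmt"
	"io"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func exampleClassification() {
	// Local cancellation and gRPC-mapped deadlines count as expected shutdowns.
	fmt.Println(isCanceledErr(context.Canceled))                         // true
	fmt.Println(isCanceledErr(status.Error(codes.DeadlineExceeded, ""))) // true

	// A plain transport error is not a cancellation, so it stays loud.
	fmt.Println(isCanceledErr(io.EOF)) // false
}
```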
@@ -194,8 +212,13 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
 		resp, err := stream.Recv()
 		if err != nil {
-			glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err)
-			stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc()
+			canceled := isCanceledErr(err) || ctx.Err() != nil
+			if canceled {
+				glog.V(1).Infof("%s.%s masterClient stream closed from %s: %v", mc.FilerGroup, mc.clientType, master, err)
+			} else {
+				glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err)
+				stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc()
+			}
 			return err
 		}
@@ -219,8 +242,13 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
 		for {
 			resp, err := stream.Recv()
 			if err != nil {
-				glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err)
-				stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc()
+				canceled := isCanceledErr(err) || ctx.Err() != nil
+				if canceled {
+					glog.V(1).Infof("%s.%s masterClient stream closed from %s: %v", mc.FilerGroup, mc.clientType, master, err)
+				} else {
+					glog.V(0).Infof("%s.%s masterClient failed to receive from %s: %v", mc.FilerGroup, mc.clientType, master, err)
+					stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc()
+				}
 				return err
 			}
@@ -252,12 +280,20 @@ func (mc *MasterClient) tryConnectToMaster(ctx context.Context, master pb.Server
 				mc.OnPeerUpdateLock.RUnlock()
 			}
 			if err := ctx.Err(); err != nil {
-				glog.V(0).Infof("Connection attempt to master stopped: %v", err)
+				if isCanceledErr(err) {
+					glog.V(1).Infof("Connection attempt to master stopped: %v", err)
+				} else {
+					glog.V(0).Infof("Connection attempt to master stopped: %v", err)
+				}
 				return err
 			}
 		}
 	})
 	if gprcErr != nil {
+		if isCanceledErr(gprcErr) || ctx.Err() != nil {
+			glog.V(1).Infof("%s.%s masterClient connection closed to %v: %v", mc.FilerGroup, mc.clientType, master, gprcErr)
+			return nextHintedLeader
+		}
 		stats.MasterClientConnectCounter.WithLabelValues(stats.Failed).Inc()
 		glog.V(1).Infof("%s.%s masterClient failed to connect with master %v: %v", mc.FilerGroup, mc.clientType, master, gprcErr)
 	}
@@ -387,7 +423,11 @@ func (mc *MasterClient) KeepConnectedToMaster(ctx context.Context) {
 	for {
 		select {
 		case <-ctx.Done():
-			glog.V(0).Infof("Connection to masters stopped: %v", ctx.Err())
+			if isCanceledErr(ctx.Err()) {
+				glog.V(1).Infof("Connection to masters stopped: %v", ctx.Err())
+			} else {
+				glog.V(0).Infof("Connection to masters stopped: %v", ctx.Err())
+			}
 			return
 		default:
 			reconnectStart := time.Now()
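
The receive, connect, and shutdown hunks above all repeat one pattern: log at V(1) when the error is an expected cancellation, and keep the V(0) log plus the failure counter otherwise. If the pattern grew further, it could be collected into a helper along these lines (a sketch; `logStreamError` is a hypothetical name and is not part of the diff, assuming the glog/stats imports already in the file):

```go
// logStreamError consolidates the cancel-aware logging pattern used in this
// diff: quiet V(1) for expected shutdowns, loud V(0) plus a failure metric
// for anything else (hypothetical helper, not in the actual change).
func logStreamError(ctx context.Context, err error, source string) {
	if isCanceledErr(err) || ctx.Err() != nil {
		glog.V(1).Infof("masterClient stream closed from %s: %v", source, err)
		return
	}
	glog.V(0).Infof("masterClient failed to receive from %s: %v", source, err)
	stats.MasterClientConnectCounter.WithLabelValues(stats.FailedToReceive).Inc()
}
```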