Expire stuck plugin jobs (#8492)
* Add stale job expiry and expire API
* Add expire job button
* Add test hook and coverage for ExpirePluginJobAPI
* Document scheduler filtering side effect and reuse helper
* Restore job spec proposal test
* Regenerate plugin template output

---------

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
@@ -98,6 +98,7 @@ type AdminServer struct {
|
|||||||
// Maintenance system
|
// Maintenance system
|
||||||
maintenanceManager *maintenance.MaintenanceManager
|
maintenanceManager *maintenance.MaintenanceManager
|
||||||
plugin *adminplugin.Plugin
|
plugin *adminplugin.Plugin
|
||||||
|
expireJobHandler func(jobID string, reason string) (*adminplugin.TrackedJob, bool, error)
|
||||||
|
|
||||||
// Topic retention purger
|
// Topic retention purger
|
||||||
topicRetentionPurger *TopicRetentionPurger
|
topicRetentionPurger *TopicRetentionPurger
|
||||||
@@ -1020,6 +1021,17 @@ func (s *AdminServer) GetPluginJobDetail(jobID string, activityLimit, relatedLim
|
|||||||
return s.plugin.BuildJobDetail(jobID, activityLimit, relatedLimit)
|
return s.plugin.BuildJobDetail(jobID, activityLimit, relatedLimit)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ExpirePluginJob marks an active plugin job as failed so it no longer blocks scheduling.
|
||||||
|
func (s *AdminServer) ExpirePluginJob(jobID, reason string) (*adminplugin.TrackedJob, bool, error) {
|
||||||
|
if handler := s.expireJobHandler; handler != nil {
|
||||||
|
return handler(jobID, reason)
|
||||||
|
}
|
||||||
|
if s.plugin == nil {
|
||||||
|
return nil, false, fmt.Errorf("plugin is not enabled")
|
||||||
|
}
|
||||||
|
return s.plugin.ExpireJob(jobID, reason)
|
||||||
|
}
|
||||||
|
|
||||||
// ListPluginActivities returns plugin job activities for monitoring.
|
// ListPluginActivities returns plugin job activities for monitoring.
|
||||||
func (s *AdminServer) ListPluginActivities(jobType string, limit int) []adminplugin.JobActivity {
|
func (s *AdminServer) ListPluginActivities(jobType string, limit int) []adminplugin.JobActivity {
|
||||||
if s.plugin == nil {
|
if s.plugin == nil {
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
@@ -130,6 +131,47 @@ func (s *AdminServer) GetPluginJobDetailAPI(w http.ResponseWriter, r *http.Reque
|
|||||||
writeJSON(w, http.StatusOK, detail)
|
writeJSON(w, http.StatusOK, detail)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ExpirePluginJobAPI marks a job as failed so it no longer blocks scheduling.
|
||||||
|
func (s *AdminServer) ExpirePluginJobAPI(w http.ResponseWriter, r *http.Request) {
|
||||||
|
jobID := strings.TrimSpace(mux.Vars(r)["jobId"])
|
||||||
|
if jobID == "" {
|
||||||
|
writeJSONError(w, http.StatusBadRequest, "jobId is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var req struct {
|
||||||
|
Reason string `json:"reason"`
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := decodeJSONBody(newJSONMaxReader(w, r), &req); err != nil && err != io.EOF {
|
||||||
|
writeJSONError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
job, expired, err := s.ExpirePluginJob(jobID, req.Reason)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, plugin.ErrJobNotFound) {
|
||||||
|
writeJSONError(w, http.StatusNotFound, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSONError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
response := map[string]interface{}{
|
||||||
|
"job_id": jobID,
|
||||||
|
"expired": expired,
|
||||||
|
}
|
||||||
|
if job != nil {
|
||||||
|
response["job"] = job
|
||||||
|
}
|
||||||
|
if !expired {
|
||||||
|
response["message"] = "job is not active"
|
||||||
|
}
|
||||||
|
|
||||||
|
writeJSON(w, http.StatusOK, response)
|
||||||
|
}
|
||||||
|
|
||||||
// GetPluginActivitiesAPI returns recent plugin activities.
|
// GetPluginActivitiesAPI returns recent plugin activities.
|
||||||
func (s *AdminServer) GetPluginActivitiesAPI(w http.ResponseWriter, r *http.Request) {
|
func (s *AdminServer) GetPluginActivitiesAPI(w http.ResponseWriter, r *http.Request) {
|
||||||
query := r.URL.Query()
|
query := r.URL.Query()
|
||||||
|
|||||||
@@ -1,11 +1,120 @@
|
|||||||
package dash
|
package dash
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/gorilla/mux"
|
||||||
|
"github.com/seaweedfs/seaweedfs/weed/admin/plugin"
|
||||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func TestExpirePluginJobAPI(t *testing.T) {
|
||||||
|
makeRequest := func(adminServer *AdminServer, jobID string, body io.Reader) *httptest.ResponseRecorder {
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/api/plugin/jobs/"+jobID+"/expire", body)
|
||||||
|
req = mux.SetURLVars(req, map[string]string{"jobId": jobID})
|
||||||
|
recorder := httptest.NewRecorder()
|
||||||
|
adminServer.ExpirePluginJobAPI(recorder, req)
|
||||||
|
return recorder
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("empty job id", func(t *testing.T) {
|
||||||
|
recorder := makeRequest(&AdminServer{}, "", nil)
|
||||||
|
if recorder.Code != http.StatusBadRequest {
|
||||||
|
t.Fatalf("expected 400, got %d", recorder.Code)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("invalid json", func(t *testing.T) {
|
||||||
|
recorder := makeRequest(&AdminServer{}, "job-id", strings.NewReader("{"))
|
||||||
|
if recorder.Code != http.StatusBadRequest {
|
||||||
|
t.Fatalf("expected 400, got %d", recorder.Code)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("job not found", func(t *testing.T) {
|
||||||
|
adminServer := &AdminServer{
|
||||||
|
expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
|
||||||
|
return nil, false, plugin.ErrJobNotFound
|
||||||
|
},
|
||||||
|
}
|
||||||
|
recorder := makeRequest(adminServer, "missing", strings.NewReader(`{"reason":"nope"}`))
|
||||||
|
if recorder.Code != http.StatusNotFound {
|
||||||
|
t.Fatalf("expected 404, got %d", recorder.Code)
|
||||||
|
}
|
||||||
|
var payload map[string]any
|
||||||
|
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||||
|
t.Fatalf("failed to unmarshal body: %v", err)
|
||||||
|
}
|
||||||
|
if payload["error"] == nil {
|
||||||
|
t.Fatalf("expected error payload, got %v", payload)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("successful expire", func(t *testing.T) {
|
||||||
|
expected := &plugin.TrackedJob{JobID: "foo", State: "assigned"}
|
||||||
|
adminServer := &AdminServer{
|
||||||
|
expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
|
||||||
|
if jobID != "foo" {
|
||||||
|
return nil, false, errors.New("unexpected")
|
||||||
|
}
|
||||||
|
return expected, true, nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
recorder := makeRequest(adminServer, "foo", strings.NewReader(`{"reason":"cleanup"}`))
|
||||||
|
if recorder.Code != http.StatusOK {
|
||||||
|
t.Fatalf("expected 200, got %d", recorder.Code)
|
||||||
|
}
|
||||||
|
var payload map[string]any
|
||||||
|
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||||
|
t.Fatalf("failed to decode payload: %v", err)
|
||||||
|
}
|
||||||
|
if payload["job_id"] != "foo" {
|
||||||
|
t.Fatalf("expected job_id foo, got %v", payload["job_id"])
|
||||||
|
}
|
||||||
|
if expired, ok := payload["expired"].(bool); !ok || !expired {
|
||||||
|
t.Fatalf("expected expired=true, got %v", payload["expired"])
|
||||||
|
}
|
||||||
|
jobData, ok := payload["job"].(map[string]any)
|
||||||
|
if !ok || jobData["job_id"] != "foo" {
|
||||||
|
t.Fatalf("expected job info with job_id, got %v", payload["job"])
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("non-active job", func(t *testing.T) {
|
||||||
|
adminServer := &AdminServer{
|
||||||
|
expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
|
||||||
|
return nil, false, nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
recorder := makeRequest(adminServer, "bar", strings.NewReader(`{"reason":"ignore"}`))
|
||||||
|
if recorder.Code != http.StatusOK {
|
||||||
|
t.Fatalf("expected 200, got %d", recorder.Code)
|
||||||
|
}
|
||||||
|
var payload map[string]any
|
||||||
|
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||||
|
t.Fatalf("failed to decode payload: %v", err)
|
||||||
|
}
|
||||||
|
if payload["job_id"] != "bar" {
|
||||||
|
t.Fatalf("expected job_id bar, got %v", payload["job_id"])
|
||||||
|
}
|
||||||
|
if expired, ok := payload["expired"].(bool); !ok || expired {
|
||||||
|
t.Fatalf("expected expired=false, got %v", payload["expired"])
|
||||||
|
}
|
||||||
|
if payload["message"] != "job is not active" {
|
||||||
|
t.Fatalf("expected message job is not active, got %v", payload["message"])
|
||||||
|
}
|
||||||
|
if _, exists := payload["job"]; exists {
|
||||||
|
t.Fatalf("expected no job payload for non-active job, got %v", payload["job"])
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildJobSpecFromProposalDoesNotReuseProposalID(t *testing.T) {
|
func TestBuildJobSpecFromProposalDoesNotReuseProposalID(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -242,6 +242,7 @@ func (h *AdminHandlers) registerAPIRoutes(api *mux.Router, enforceWrite bool) {
|
|||||||
pluginApi.Handle("/job-types/{jobType}/detect", wrapWrite(h.adminServer.TriggerPluginDetectionAPI)).Methods(http.MethodPost)
|
pluginApi.Handle("/job-types/{jobType}/detect", wrapWrite(h.adminServer.TriggerPluginDetectionAPI)).Methods(http.MethodPost)
|
||||||
pluginApi.Handle("/job-types/{jobType}/run", wrapWrite(h.adminServer.RunPluginJobTypeAPI)).Methods(http.MethodPost)
|
pluginApi.Handle("/job-types/{jobType}/run", wrapWrite(h.adminServer.RunPluginJobTypeAPI)).Methods(http.MethodPost)
|
||||||
pluginApi.Handle("/jobs/execute", wrapWrite(h.adminServer.ExecutePluginJobAPI)).Methods(http.MethodPost)
|
pluginApi.Handle("/jobs/execute", wrapWrite(h.adminServer.ExecutePluginJobAPI)).Methods(http.MethodPost)
|
||||||
|
pluginApi.Handle("/jobs/{jobId}/expire", wrapWrite(h.adminServer.ExpirePluginJobAPI)).Methods(http.MethodPost)
|
||||||
|
|
||||||
mqApi := api.PathPrefix("/mq").Subrouter()
|
mqApi := api.PathPrefix("/mq").Subrouter()
|
||||||
mqApi.HandleFunc("/topics/{namespace}/{topic}", h.mqHandlers.GetTopicDetailsAPI).Methods(http.MethodGet)
|
mqApi.HandleFunc("/topics/{namespace}/{topic}", h.mqHandlers.GetTopicDetailsAPI).Methods(http.MethodGet)
|
||||||
|
|||||||
@@ -21,6 +21,9 @@ func TestSetupRoutes_RegistersPluginSchedulerStatesAPI_NoAuth(t *testing.T) {
|
|||||||
if !hasRoute(router, http.MethodGet, "/api/plugin/jobs/example/detail") {
|
if !hasRoute(router, http.MethodGet, "/api/plugin/jobs/example/detail") {
|
||||||
t.Fatalf("expected GET /api/plugin/jobs/:jobId/detail to be registered in no-auth mode")
|
t.Fatalf("expected GET /api/plugin/jobs/:jobId/detail to be registered in no-auth mode")
|
||||||
}
|
}
|
||||||
|
if !hasRoute(router, http.MethodPost, "/api/plugin/jobs/example/expire") {
|
||||||
|
t.Fatalf("expected POST /api/plugin/jobs/:jobId/expire to be registered in no-auth mode")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSetupRoutes_RegistersPluginSchedulerStatesAPI_WithAuth(t *testing.T) {
|
func TestSetupRoutes_RegistersPluginSchedulerStatesAPI_WithAuth(t *testing.T) {
|
||||||
@@ -34,6 +37,9 @@ func TestSetupRoutes_RegistersPluginSchedulerStatesAPI_WithAuth(t *testing.T) {
|
|||||||
if !hasRoute(router, http.MethodGet, "/api/plugin/jobs/example/detail") {
|
if !hasRoute(router, http.MethodGet, "/api/plugin/jobs/example/detail") {
|
||||||
t.Fatalf("expected GET /api/plugin/jobs/:jobId/detail to be registered in auth mode")
|
t.Fatalf("expected GET /api/plugin/jobs/:jobId/detail to be registered in auth mode")
|
||||||
}
|
}
|
||||||
|
if !hasRoute(router, http.MethodPost, "/api/plugin/jobs/example/expire") {
|
||||||
|
t.Fatalf("expected POST /api/plugin/jobs/:jobId/expire to be registered in auth mode")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSetupRoutes_RegistersPluginPages_NoAuth(t *testing.T) {
|
func TestSetupRoutes_RegistersPluginPages_NoAuth(t *testing.T) {
|
||||||
|
|||||||
@@ -80,6 +80,8 @@ type Plugin struct {
|
|||||||
|
|
||||||
jobsMu sync.RWMutex
|
jobsMu sync.RWMutex
|
||||||
jobs map[string]*TrackedJob
|
jobs map[string]*TrackedJob
|
||||||
|
// serialize stale job cleanup to avoid duplicate expirations
|
||||||
|
staleJobsMu sync.Mutex
|
||||||
|
|
||||||
jobDetailsMu sync.Mutex
|
jobDetailsMu sync.Mutex
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package plugin
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -9,12 +10,18 @@ import (
|
|||||||
"github.com/seaweedfs/seaweedfs/weed/glog"
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
||||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||||
"google.golang.org/protobuf/encoding/protojson"
|
"google.golang.org/protobuf/encoding/protojson"
|
||||||
|
"google.golang.org/protobuf/types/known/timestamppb"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
maxTrackedJobsTotal = 1000
|
maxTrackedJobsTotal = 1000
|
||||||
maxActivityRecords = 4000
|
maxActivityRecords = 4000
|
||||||
maxRelatedJobs = 100
|
maxRelatedJobs = 100
|
||||||
|
|
||||||
|
// stale active jobs block dedupe and scheduling; use generous defaults to
|
||||||
|
// avoid expiring legitimate long-running tasks.
|
||||||
|
defaultStaleActiveJobTimeout = 24 * time.Hour
|
||||||
|
defaultOrphanedActiveJobTimeout = 15 * time.Minute
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
@@ -23,6 +30,14 @@ var (
|
|||||||
StateCanceled = strings.ToLower(plugin_pb.JobState_JOB_STATE_CANCELED.String())
|
StateCanceled = strings.ToLower(plugin_pb.JobState_JOB_STATE_CANCELED.String())
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// activeJobSnapshot is a copy of the fields of an active tracked job that the
// stale-job check needs, taken so the check can run without holding jobsMu.
type activeJobSnapshot struct {
	// Identity of the job and of the worker/request it is associated with.
	jobID, jobType, workerID, requestID string

	// lastUpdate is the job's most recent update time (see jobLastUpdated);
	// the zero value means no timestamp was available.
	lastUpdate time.Time
}
|
||||||
|
|
||||||
// activityLess reports whether activity a occurred after activity b (newest-first order).
|
// activityLess reports whether activity a occurred after activity b (newest-first order).
|
||||||
// A nil OccurredAt is treated as the zero time.
|
// A nil OccurredAt is treated as the zero time.
|
||||||
func activityLess(a, b JobActivity) bool {
|
func activityLess(a, b JobActivity) bool {
|
||||||
@@ -54,6 +69,13 @@ func (r *Plugin) loadPersistedMonitorState() error {
|
|||||||
if strings.TrimSpace(job.JobID) == "" {
|
if strings.TrimSpace(job.JobID) == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if isActiveTrackedJobState(job.State) {
|
||||||
|
if detail, detailErr := r.store.LoadJobDetail(job.JobID); detailErr != nil {
|
||||||
|
glog.Warningf("Plugin failed to load detail snapshot for job %s: %v", job.JobID, detailErr)
|
||||||
|
} else if detail != nil {
|
||||||
|
mergeTerminalDetailIntoTracked(&job, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
// Backward compatibility: migrate older inline detail payloads
|
// Backward compatibility: migrate older inline detail payloads
|
||||||
// out of tracked_jobs.json into dedicated per-job detail files.
|
// out of tracked_jobs.json into dedicated per-job detail files.
|
||||||
if hasTrackedJobRichDetails(job) {
|
if hasTrackedJobRichDetails(job) {
|
||||||
@@ -81,6 +103,265 @@ func (r *Plugin) loadPersistedMonitorState() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ExpireJob marks an active job as failed so it no longer blocks scheduling.
|
||||||
|
func (r *Plugin) ExpireJob(jobID, reason string) (*TrackedJob, bool, error) {
|
||||||
|
normalizedJobID := strings.TrimSpace(jobID)
|
||||||
|
if normalizedJobID == "" {
|
||||||
|
return nil, false, ErrJobNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
reason = strings.TrimSpace(reason)
|
||||||
|
if reason == "" {
|
||||||
|
reason = "job expired by admin request"
|
||||||
|
}
|
||||||
|
|
||||||
|
var jobType string
|
||||||
|
var requestID string
|
||||||
|
active := false
|
||||||
|
|
||||||
|
r.jobsMu.RLock()
|
||||||
|
if tracked := r.jobs[normalizedJobID]; tracked != nil {
|
||||||
|
jobType = tracked.JobType
|
||||||
|
requestID = tracked.RequestID
|
||||||
|
active = isActiveTrackedJobState(tracked.State)
|
||||||
|
}
|
||||||
|
r.jobsMu.RUnlock()
|
||||||
|
|
||||||
|
if jobType == "" || requestID == "" || !active {
|
||||||
|
if detail, err := r.store.LoadJobDetail(normalizedJobID); err != nil {
|
||||||
|
return nil, false, err
|
||||||
|
} else if detail != nil {
|
||||||
|
if jobType == "" {
|
||||||
|
jobType = detail.JobType
|
||||||
|
}
|
||||||
|
if requestID == "" {
|
||||||
|
requestID = detail.RequestID
|
||||||
|
}
|
||||||
|
if !active && isActiveTrackedJobState(detail.State) {
|
||||||
|
active = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if jobType == "" {
|
||||||
|
return nil, false, ErrJobNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
if !active {
|
||||||
|
current, _ := r.GetTrackedJob(normalizedJobID)
|
||||||
|
if current == nil {
|
||||||
|
if detail, err := r.store.LoadJobDetail(normalizedJobID); err == nil && detail != nil {
|
||||||
|
clone := cloneTrackedJob(*detail)
|
||||||
|
current = &clone
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return current, false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
r.handleJobCompleted(&plugin_pb.JobCompleted{
|
||||||
|
JobId: normalizedJobID,
|
||||||
|
JobType: jobType,
|
||||||
|
RequestId: requestID,
|
||||||
|
Success: false,
|
||||||
|
ErrorMessage: reason,
|
||||||
|
CompletedAt: timestamppb.New(now),
|
||||||
|
})
|
||||||
|
r.appendActivity(JobActivity{
|
||||||
|
JobID: normalizedJobID,
|
||||||
|
JobType: jobType,
|
||||||
|
RequestID: requestID,
|
||||||
|
Source: "admin_expire",
|
||||||
|
Message: reason,
|
||||||
|
Stage: "expired",
|
||||||
|
OccurredAt: timeToPtr(now),
|
||||||
|
})
|
||||||
|
|
||||||
|
updated, _ := r.GetTrackedJob(normalizedJobID)
|
||||||
|
return updated, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// expireStaleJobs marks stale active jobs as failed so they stop blocking new work.
|
||||||
|
func (r *Plugin) expireStaleJobs(now time.Time) int {
|
||||||
|
if now.IsZero() {
|
||||||
|
now = time.Now().UTC()
|
||||||
|
}
|
||||||
|
|
||||||
|
r.staleJobsMu.Lock()
|
||||||
|
defer r.staleJobsMu.Unlock()
|
||||||
|
|
||||||
|
snapshots := r.snapshotActiveJobs()
|
||||||
|
if len(snapshots) == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
expired := 0
|
||||||
|
for _, snap := range snapshots {
|
||||||
|
if snap.lastUpdate.IsZero() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if stale, _, _ := r.evaluateStaleJob(now, snap.workerID, snap.lastUpdate); !stale {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
reason := r.confirmStaleReason(now, snap.jobID)
|
||||||
|
if reason == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
r.handleJobCompleted(&plugin_pb.JobCompleted{
|
||||||
|
JobId: snap.jobID,
|
||||||
|
JobType: snap.jobType,
|
||||||
|
RequestId: snap.requestID,
|
||||||
|
Success: false,
|
||||||
|
ErrorMessage: reason,
|
||||||
|
CompletedAt: timestamppb.New(now),
|
||||||
|
})
|
||||||
|
expired++
|
||||||
|
}
|
||||||
|
|
||||||
|
return expired
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *Plugin) snapshotActiveJobs() []activeJobSnapshot {
|
||||||
|
r.jobsMu.RLock()
|
||||||
|
defer r.jobsMu.RUnlock()
|
||||||
|
|
||||||
|
if len(r.jobs) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]activeJobSnapshot, 0, len(r.jobs))
|
||||||
|
for _, job := range r.jobs {
|
||||||
|
if job == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isActiveTrackedJobState(job.State) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, activeJobSnapshot{
|
||||||
|
jobID: job.JobID,
|
||||||
|
jobType: job.JobType,
|
||||||
|
workerID: job.WorkerID,
|
||||||
|
requestID: job.RequestID,
|
||||||
|
lastUpdate: jobLastUpdated(job),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func jobLastUpdated(job *TrackedJob) time.Time {
|
||||||
|
if job == nil {
|
||||||
|
return time.Time{}
|
||||||
|
}
|
||||||
|
if job.UpdatedAt != nil && !job.UpdatedAt.IsZero() {
|
||||||
|
return *job.UpdatedAt
|
||||||
|
}
|
||||||
|
if job.CreatedAt != nil && !job.CreatedAt.IsZero() {
|
||||||
|
return *job.CreatedAt
|
||||||
|
}
|
||||||
|
return time.Time{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *Plugin) evaluateStaleJob(now time.Time, workerID string, lastUpdate time.Time) (bool, time.Duration, string) {
|
||||||
|
if lastUpdate.IsZero() {
|
||||||
|
return false, 0, ""
|
||||||
|
}
|
||||||
|
|
||||||
|
timeout := defaultStaleActiveJobTimeout
|
||||||
|
reason := fmt.Sprintf("job expired after %s without progress", timeout)
|
||||||
|
|
||||||
|
workerID = strings.TrimSpace(workerID)
|
||||||
|
if workerID == "" {
|
||||||
|
reason = fmt.Sprintf("job expired after %s without executor assignment", timeout)
|
||||||
|
} else if !r.isWorkerAvailable(workerID) {
|
||||||
|
timeout = defaultOrphanedActiveJobTimeout
|
||||||
|
reason = fmt.Sprintf("job expired after %s without worker heartbeat (worker=%s)", timeout, workerID)
|
||||||
|
}
|
||||||
|
|
||||||
|
if now.Sub(lastUpdate) < timeout {
|
||||||
|
return false, timeout, reason
|
||||||
|
}
|
||||||
|
return true, timeout, reason
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *Plugin) confirmStaleReason(now time.Time, jobID string) string {
|
||||||
|
r.jobsMu.RLock()
|
||||||
|
job := r.jobs[jobID]
|
||||||
|
if job == nil || !isActiveTrackedJobState(job.State) {
|
||||||
|
r.jobsMu.RUnlock()
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
lastUpdate := jobLastUpdated(job)
|
||||||
|
workerID := job.WorkerID
|
||||||
|
r.jobsMu.RUnlock()
|
||||||
|
|
||||||
|
stale, _, reason := r.evaluateStaleJob(now, workerID, lastUpdate)
|
||||||
|
if !stale {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return reason
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *Plugin) isWorkerAvailable(workerID string) bool {
|
||||||
|
workerID = strings.TrimSpace(workerID)
|
||||||
|
if workerID == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
_, ok := r.registry.Get(workerID)
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func isTerminalTrackedJobState(state string) bool {
|
||||||
|
normalized := strings.ToLower(strings.TrimSpace(state))
|
||||||
|
switch normalized {
|
||||||
|
case StateSucceeded, StateFailed, StateCanceled:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeTerminalDetailIntoTracked(tracked *TrackedJob, detail *TrackedJob) {
|
||||||
|
if tracked == nil || detail == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !isTerminalTrackedJobState(detail.State) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !isActiveTrackedJobState(tracked.State) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if detail.State != "" {
|
||||||
|
tracked.State = detail.State
|
||||||
|
}
|
||||||
|
if detail.Progress != 0 {
|
||||||
|
tracked.Progress = detail.Progress
|
||||||
|
}
|
||||||
|
if detail.Stage != "" {
|
||||||
|
tracked.Stage = detail.Stage
|
||||||
|
}
|
||||||
|
if detail.Message != "" {
|
||||||
|
tracked.Message = detail.Message
|
||||||
|
}
|
||||||
|
if detail.ErrorMessage != "" {
|
||||||
|
tracked.ErrorMessage = detail.ErrorMessage
|
||||||
|
}
|
||||||
|
if detail.ResultSummary != "" {
|
||||||
|
tracked.ResultSummary = detail.ResultSummary
|
||||||
|
}
|
||||||
|
if detail.CompletedAt != nil && !detail.CompletedAt.IsZero() {
|
||||||
|
tracked.CompletedAt = detail.CompletedAt
|
||||||
|
}
|
||||||
|
if detail.UpdatedAt != nil && !detail.UpdatedAt.IsZero() {
|
||||||
|
tracked.UpdatedAt = detail.UpdatedAt
|
||||||
|
}
|
||||||
|
if tracked.UpdatedAt == nil && tracked.CompletedAt != nil {
|
||||||
|
tracked.UpdatedAt = tracked.CompletedAt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (r *Plugin) ListTrackedJobs(jobType string, state string, limit int) []TrackedJob {
|
func (r *Plugin) ListTrackedJobs(jobType string, state string, limit int) []TrackedJob {
|
||||||
r.jobsMu.RLock()
|
r.jobsMu.RLock()
|
||||||
defer r.jobsMu.RUnlock()
|
defer r.jobsMu.RUnlock()
|
||||||
|
|||||||
@@ -61,6 +61,8 @@ func (r *Plugin) schedulerLoop() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (r *Plugin) runSchedulerTick() {
|
func (r *Plugin) runSchedulerTick() {
|
||||||
|
r.expireStaleJobs(time.Now().UTC())
|
||||||
|
|
||||||
jobTypes := r.registry.DetectableJobTypes()
|
jobTypes := r.registry.DetectableJobTypes()
|
||||||
if len(jobTypes) == 0 {
|
if len(jobTypes) == 0 {
|
||||||
return
|
return
|
||||||
@@ -839,11 +841,16 @@ func waitForShutdownOrTimer(shutdown <-chan struct{}, duration time.Duration) bo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// filterProposalsWithActiveJobs removes proposals whose dedupe keys already have active jobs.
|
||||||
|
// It first expires stale tracked jobs via expireStaleJobs, which can mutate scheduler state,
|
||||||
|
// so callers should treat this method as a stateful operation.
|
||||||
func (r *Plugin) filterProposalsWithActiveJobs(jobType string, proposals []*plugin_pb.JobProposal) ([]*plugin_pb.JobProposal, int) {
|
func (r *Plugin) filterProposalsWithActiveJobs(jobType string, proposals []*plugin_pb.JobProposal) ([]*plugin_pb.JobProposal, int) {
|
||||||
if len(proposals) == 0 {
|
if len(proposals) == 0 {
|
||||||
return proposals, 0
|
return proposals, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
r.expireStaleJobs(time.Now().UTC())
|
||||||
|
|
||||||
activeKeys := make(map[string]struct{})
|
activeKeys := make(map[string]struct{})
|
||||||
r.jobsMu.RLock()
|
r.jobsMu.RLock()
|
||||||
for _, job := range r.jobs {
|
for _, job := range r.jobs {
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package plugin
|
package plugin
|
||||||
|
|
||||||
import "time"
|
import (
|
||||||
|
"errors"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// Keep exactly the last 10 successful and last 10 error runs per job type.
|
// Keep exactly the last 10 successful and last 10 error runs per job type.
|
||||||
@@ -8,6 +11,8 @@ const (
|
|||||||
MaxErrorRunHistory = 10
|
MaxErrorRunHistory = 10
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// ErrJobNotFound is the sentinel error reported when a job ID is blank or
// cannot be resolved to a known job. Match it with errors.Is.
var ErrJobNotFound = errors.New("job not found")
|
||||||
|
|
||||||
type RunOutcome string
|
type RunOutcome string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
|||||||
@@ -485,6 +485,11 @@ templ Plugin(page string) {
|
|||||||
<div class="modal-content">
|
<div class="modal-content">
|
||||||
<div class="modal-header">
|
<div class="modal-header">
|
||||||
<h5 class="modal-title" id="plugin-job-detail-modal-label"><i class="fas fa-file-alt me-2"></i>Job Detail</h5>
|
<h5 class="modal-title" id="plugin-job-detail-modal-label"><i class="fas fa-file-alt me-2"></i>Job Detail</h5>
|
||||||
|
<div class="ms-auto me-2">
|
||||||
|
<button type="button" class="btn btn-outline-danger btn-sm" id="plugin-expire-job-btn" disabled>
|
||||||
|
<i class="fas fa-stop-circle me-1"></i>Expire Job
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||||
</div>
|
</div>
|
||||||
<div class="modal-body" id="plugin-job-detail-content">
|
<div class="modal-body" id="plugin-job-detail-content">
|
||||||
@@ -1073,6 +1078,66 @@ templ Plugin(page string) {
|
|||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true when the given state denotes an active job (pending, assigned,
// running or in progress). Accepts either a state string or a job object
// with a `state` property; the comparison is case-insensitive.
function isActiveJobState(candidateState) {
    var activeStates = [
        'job_state_pending',
        'job_state_assigned',
        'job_state_running',
        'pending',
        'assigned',
        'running',
        'in_progress'
    ];

    var raw = candidateState;
    var isJobObject = Boolean(candidateState) &&
        typeof candidateState === 'object' &&
        candidateState.state !== undefined;
    if (isJobObject) {
        raw = candidateState.state;
    }

    var normalized = String(raw || '').toLowerCase();
    return activeStates.indexOf(normalized) !== -1;
}
|
||||||
|
|
||||||
|
// Syncs the "Expire Job" button with the given job: stores the job id on the
// button, enables it only for a job that has an id and is active, and sets a
// tooltip explaining the current state. Does nothing if the button is absent.
function setExpireButtonState(job) {
    var button = document.getElementById('plugin-expire-job-btn');
    if (!button) {
        return;
    }

    var id = '';
    if (job && job.job_id) {
        id = String(job.job_id);
    }
    var jobIsActive = isActiveJobState(job);

    var tooltip = 'Expire job to unblock scheduling.';
    if (!id) {
        tooltip = 'Select a job to expire.';
    } else if (!jobIsActive) {
        tooltip = 'Job is not active.';
    }

    button.setAttribute('data-job-id', id);
    button.disabled = !(id && jobIsActive);
    button.title = tooltip;
}
|
||||||
|
|
||||||
|
// Asks for an optional reason and then expires the given job through the
// plugin API. On success the job lists are refreshed and the job detail is
// reopened, which also recomputes the button state. Cancelling the prompt
// aborts without sending a request.
async function expireJob(jobID) {
    var normalizedJobID = String(jobID || '').trim();
    if (!normalizedJobID) {
        return;
    }

    var reason = window.prompt('Expire job ' + normalizedJobID + '? Optional reason:', 'job expired by admin request');
    if (reason === null) {
        return;
    }

    // Disable the button while the request is in flight to prevent double submits.
    var expireBtn = document.getElementById('plugin-expire-job-btn');
    if (expireBtn) {
        expireBtn.disabled = true;
    }

    try {
        var response = await pluginRequest('POST', '/api/plugin/jobs/' + encodePath(normalizedJobID) + '/expire', {
            reason: reason,
        });
        if (response && response.expired === false) {
            notify(response.message || 'Job is not active.', 'info');
        } else {
            notify('Job expired: ' + normalizedJobID, 'success');
        }
        await refreshJobsAndActivities();
        await openJobDetail(normalizedJobID);
    } catch (e) {
        // Re-enable the button on failure; otherwise it stays disabled and
        // the expire request cannot be retried from this dialog.
        if (expireBtn) {
            expireBtn.disabled = false;
        }
        notify('Failed to expire job: ' + e.message, 'error');
    }
}
|
||||||
|
|
||||||
async function openJobDetail(jobID) {
|
async function openJobDetail(jobID) {
|
||||||
var normalizedJobID = String(jobID || '').trim();
|
var normalizedJobID = String(jobID || '').trim();
|
||||||
if (!normalizedJobID) {
|
if (!normalizedJobID) {
|
||||||
@@ -1093,10 +1158,12 @@ templ Plugin(page string) {
|
|||||||
modal.show();
|
modal.show();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setExpireButtonState(null);
|
||||||
contentRoot.innerHTML = '<div class="text-muted">Loading job detail...</div>';
|
contentRoot.innerHTML = '<div class="text-muted">Loading job detail...</div>';
|
||||||
try {
|
try {
|
||||||
var detail = await pluginRequest('GET', '/api/plugin/jobs/' + encodePath(normalizedJobID) + '/detail?activity_limit=500&related_limit=20');
|
var detail = await pluginRequest('GET', '/api/plugin/jobs/' + encodePath(normalizedJobID) + '/detail?activity_limit=500&related_limit=20');
|
||||||
var job = (detail && detail.job) ? detail.job : {};
|
var job = (detail && detail.job) ? detail.job : {};
|
||||||
|
setExpireButtonState(job);
|
||||||
var runRecord = detail && detail.run_record ? detail.run_record : null;
|
var runRecord = detail && detail.run_record ? detail.run_record : null;
|
||||||
var activities = (detail && Array.isArray(detail.activities)) ? detail.activities : [];
|
var activities = (detail && Array.isArray(detail.activities)) ? detail.activities : [];
|
||||||
var relatedJobs = (detail && Array.isArray(detail.related_jobs)) ? detail.related_jobs : [];
|
var relatedJobs = (detail && Array.isArray(detail.related_jobs)) ? detail.related_jobs : [];
|
||||||
@@ -1197,6 +1264,7 @@ templ Plugin(page string) {
|
|||||||
|
|
||||||
contentRoot.innerHTML = html;
|
contentRoot.innerHTML = html;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
setExpireButtonState(null);
|
||||||
contentRoot.innerHTML = '<div class="alert alert-danger mb-0">Failed to load job detail: ' + escapeHtml(e.message) + '</div>';
|
contentRoot.innerHTML = '<div class="alert alert-danger mb-0">Failed to load job detail: ' + escapeHtml(e.message) + '</div>';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1238,8 +1306,7 @@ templ Plugin(page string) {
|
|||||||
var allActivities = Array.isArray(state.allActivities) ? state.allActivities : [];
|
var allActivities = Array.isArray(state.allActivities) ? state.allActivities : [];
|
||||||
|
|
||||||
var activeCount = allJobs.filter(function(job) {
|
var activeCount = allJobs.filter(function(job) {
|
||||||
var st = String(job.state || '').toLowerCase();
|
return isActiveJobState(job);
|
||||||
return st === 'job_state_pending' || st === 'job_state_assigned' || st === 'job_state_running' || st === 'pending' || st === 'assigned' || st === 'running' || st === 'in_progress';
|
|
||||||
}).length;
|
}).length;
|
||||||
|
|
||||||
document.getElementById('plugin-status-workers').textContent = String(state.workers.length);
|
document.getElementById('plugin-status-workers').textContent = String(state.workers.length);
|
||||||
@@ -1265,8 +1332,7 @@ templ Plugin(page string) {
|
|||||||
if (!jobType) {
|
if (!jobType) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
var st = String(job.state || '').toLowerCase();
|
var isActive = isActiveJobState(job);
|
||||||
var isActive = st === 'job_state_pending' || st === 'job_state_assigned' || st === 'job_state_running' || st === 'pending' || st === 'assigned' || st === 'running' || st === 'in_progress';
|
|
||||||
if (!isActive) {
|
if (!isActive) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -2778,6 +2844,17 @@ templ Plugin(page string) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var expireBtn = document.getElementById('plugin-expire-job-btn');
|
||||||
|
if (expireBtn) {
|
||||||
|
expireBtn.addEventListener('click', function() {
|
||||||
|
var jobID = String(expireBtn.getAttribute('data-job-id') || '').trim();
|
||||||
|
if (!jobID) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
expireJob(jobID);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
document.getElementById('plugin-refresh-all-btn').addEventListener('click', function() {
|
document.getElementById('plugin-refresh-all-btn').addEventListener('click', function() {
|
||||||
refreshAll();
|
refreshAll();
|
||||||
});
|
});
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user