Expire stuck plugin jobs (#8492)

* Add stale job expiry and expire API

* Add expire job button

* Add test hook and coverage for ExpirePluginJobAPI

* Document scheduler filtering side effect and reuse helper

* Restore job spec proposal test

* Regenerate plugin template output

---------

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Chris Lu
2026-03-03 01:27:25 -08:00
committed by GitHub
parent 3db05f59f0
commit a61a2affe3
11 changed files with 548 additions and 6 deletions

View File

@@ -98,6 +98,7 @@ type AdminServer struct {
// Maintenance system
maintenanceManager *maintenance.MaintenanceManager
plugin *adminplugin.Plugin
expireJobHandler func(jobID string, reason string) (*adminplugin.TrackedJob, bool, error)
// Topic retention purger
topicRetentionPurger *TopicRetentionPurger
@@ -1020,6 +1021,17 @@ func (s *AdminServer) GetPluginJobDetail(jobID string, activityLimit, relatedLim
return s.plugin.BuildJobDetail(jobID, activityLimit, relatedLimit)
}
// ExpirePluginJob marks an active plugin job as failed so it no longer blocks scheduling.
//
// Resolution order:
//  1. If expireJobHandler is set (the package tests install it as a stub),
//     the call is delegated to it and the plugin is never consulted.
//  2. If no plugin is configured, an error is returned.
//  3. Otherwise the request is forwarded to the plugin's ExpireJob.
//
// The tracked job, the "expired" flag, and the error are passed through
// unchanged from whichever implementation handled the call.
func (s *AdminServer) ExpirePluginJob(jobID, reason string) (*adminplugin.TrackedJob, bool, error) {
	override := s.expireJobHandler
	switch {
	case override != nil:
		return override(jobID, reason)
	case s.plugin == nil:
		return nil, false, fmt.Errorf("plugin is not enabled")
	default:
		return s.plugin.ExpireJob(jobID, reason)
	}
}
// ListPluginActivities returns plugin job activities for monitoring.
func (s *AdminServer) ListPluginActivities(jobType string, limit int) []adminplugin.JobActivity {
if s.plugin == nil {

View File

@@ -5,6 +5,7 @@ import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
@@ -130,6 +131,47 @@ func (s *AdminServer) GetPluginJobDetailAPI(w http.ResponseWriter, r *http.Reque
writeJSON(w, http.StatusOK, detail)
}
// ExpirePluginJobAPI marks a job as failed so it no longer blocks scheduling.
//
// The job ID is taken from the "jobId" route variable (surrounding whitespace
// is trimmed). The JSON request body may carry an optional "reason" string.
//
// Responses:
//   - 400 when the job ID is blank or the body fails to decode.
//   - 404 when ExpirePluginJob reports plugin.ErrJobNotFound.
//   - 500 for any other error returned by ExpirePluginJob.
//   - 200 otherwise, with "job_id" and "expired"; "job" is included only when
//     a job was returned, and "message" is added when nothing was expired.
func (s *AdminServer) ExpirePluginJobAPI(w http.ResponseWriter, r *http.Request) {
	jobID := strings.TrimSpace(mux.Vars(r)["jobId"])
	if jobID == "" {
		writeJSONError(w, http.StatusBadRequest, "jobId is required")
		return
	}

	var req struct {
		Reason string `json:"reason"`
	}
	// io.EOF is tolerated rather than rejected — presumably it signals an empty
	// body, which is acceptable because the reason is optional (confirm against
	// decodeJSONBody). errors.Is is used instead of != so that a wrapped io.EOF
	// is recognized too, matching the sentinel check on ErrJobNotFound below.
	if err := decodeJSONBody(newJSONMaxReader(w, r), &req); err != nil && !errors.Is(err, io.EOF) {
		writeJSONError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
		return
	}

	job, expired, err := s.ExpirePluginJob(jobID, req.Reason)
	if err != nil {
		// Only an unknown job maps to 404; everything else is a server error.
		status := http.StatusInternalServerError
		if errors.Is(err, plugin.ErrJobNotFound) {
			status = http.StatusNotFound
		}
		writeJSONError(w, status, err.Error())
		return
	}

	response := map[string]any{
		"job_id":  jobID,
		"expired": expired,
	}
	if job != nil {
		response["job"] = job
	}
	if !expired {
		// A nil error with expired == false means the call was a no-op.
		response["message"] = "job is not active"
	}

	writeJSON(w, http.StatusOK, response)
}
// GetPluginActivitiesAPI returns recent plugin activities.
func (s *AdminServer) GetPluginActivitiesAPI(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query()

View File

@@ -1,11 +1,120 @@
package dash
import (
"encoding/json"
"errors"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/gorilla/mux"
"github.com/seaweedfs/seaweedfs/weed/admin/plugin"
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
)
// TestExpirePluginJobAPI drives the ExpirePluginJobAPI handler directly with
// recorded requests. Expiry itself is stubbed through
// AdminServer.expireJobHandler, so no plugin instance is required.
func TestExpirePluginJobAPI(t *testing.T) {
	// invoke calls the handler for a POST to the expire endpoint. The route
	// variable is injected by hand because the request never passes through a
	// mux router.
	invoke := func(srv *AdminServer, jobID string, body io.Reader) *httptest.ResponseRecorder {
		rec := httptest.NewRecorder()
		req := mux.SetURLVars(
			httptest.NewRequest(http.MethodPost, "/api/plugin/jobs/"+jobID+"/expire", body),
			map[string]string{"jobId": jobID},
		)
		srv.ExpirePluginJobAPI(rec, req)
		return rec
	}

	t.Run("empty job id", func(t *testing.T) {
		rec := invoke(&AdminServer{}, "", nil)
		if rec.Code != http.StatusBadRequest {
			t.Fatalf("expected 400, got %d", rec.Code)
		}
	})

	t.Run("invalid json", func(t *testing.T) {
		// A lone "{" is truncated JSON and must be rejected before any expiry call.
		rec := invoke(&AdminServer{}, "job-id", strings.NewReader("{"))
		if rec.Code != http.StatusBadRequest {
			t.Fatalf("expected 400, got %d", rec.Code)
		}
	})

	t.Run("job not found", func(t *testing.T) {
		srv := &AdminServer{
			expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
				return nil, false, plugin.ErrJobNotFound
			},
		}

		rec := invoke(srv, "missing", strings.NewReader(`{"reason":"nope"}`))
		if rec.Code != http.StatusNotFound {
			t.Fatalf("expected 404, got %d", rec.Code)
		}

		var got map[string]any
		if err := json.Unmarshal(rec.Body.Bytes(), &got); err != nil {
			t.Fatalf("failed to unmarshal body: %v", err)
		}
		if got["error"] == nil {
			t.Fatalf("expected error payload, got %v", got)
		}
	})

	t.Run("successful expire", func(t *testing.T) {
		want := &plugin.TrackedJob{JobID: "foo", State: "assigned"}
		srv := &AdminServer{
			// The stub only succeeds for the job ID the request is expected to carry.
			expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
				if jobID == "foo" {
					return want, true, nil
				}
				return nil, false, errors.New("unexpected")
			},
		}

		rec := invoke(srv, "foo", strings.NewReader(`{"reason":"cleanup"}`))
		if rec.Code != http.StatusOK {
			t.Fatalf("expected 200, got %d", rec.Code)
		}

		var got map[string]any
		if err := json.Unmarshal(rec.Body.Bytes(), &got); err != nil {
			t.Fatalf("failed to decode payload: %v", err)
		}
		if got["job_id"] != "foo" {
			t.Fatalf("expected job_id foo, got %v", got["job_id"])
		}
		if flag, isBool := got["expired"].(bool); !(isBool && flag) {
			t.Fatalf("expected expired=true, got %v", got["expired"])
		}
		jobInfo, isObject := got["job"].(map[string]any)
		if !isObject || jobInfo["job_id"] != "foo" {
			t.Fatalf("expected job info with job_id, got %v", got["job"])
		}
	})

	t.Run("non-active job", func(t *testing.T) {
		srv := &AdminServer{
			// No job, no error, not expired: the handler should report a no-op.
			expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
				return nil, false, nil
			},
		}

		rec := invoke(srv, "bar", strings.NewReader(`{"reason":"ignore"}`))
		if rec.Code != http.StatusOK {
			t.Fatalf("expected 200, got %d", rec.Code)
		}

		var got map[string]any
		if err := json.Unmarshal(rec.Body.Bytes(), &got); err != nil {
			t.Fatalf("failed to decode payload: %v", err)
		}
		if got["job_id"] != "bar" {
			t.Fatalf("expected job_id bar, got %v", got["job_id"])
		}
		if flag, isBool := got["expired"].(bool); !isBool || flag {
			t.Fatalf("expected expired=false, got %v", got["expired"])
		}
		if got["message"] != "job is not active" {
			t.Fatalf("expected message job is not active, got %v", got["message"])
		}
		if _, present := got["job"]; present {
			t.Fatalf("expected no job payload for non-active job, got %v", got["job"])
		}
	})
}
func TestBuildJobSpecFromProposalDoesNotReuseProposalID(t *testing.T) {
t.Parallel()