Expire stuck plugin jobs (#8492)
* Add stale job expiry and expire API
* Add expire job button
* Add test hook and coverage for ExpirePluginJobAPI
* Document scheduler filtering side effect and reuse helper
* Restore job spec proposal test
* Regenerate plugin template output

---------

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
@@ -98,6 +98,7 @@ type AdminServer struct {
|
||||
// Maintenance system
|
||||
maintenanceManager *maintenance.MaintenanceManager
|
||||
plugin *adminplugin.Plugin
|
||||
expireJobHandler func(jobID string, reason string) (*adminplugin.TrackedJob, bool, error)
|
||||
|
||||
// Topic retention purger
|
||||
topicRetentionPurger *TopicRetentionPurger
|
||||
@@ -1020,6 +1021,17 @@ func (s *AdminServer) GetPluginJobDetail(jobID string, activityLimit, relatedLim
|
||||
return s.plugin.BuildJobDetail(jobID, activityLimit, relatedLimit)
|
||||
}
|
||||
|
||||
// ExpirePluginJob marks an active plugin job as failed so it no longer blocks scheduling.
|
||||
func (s *AdminServer) ExpirePluginJob(jobID, reason string) (*adminplugin.TrackedJob, bool, error) {
|
||||
if handler := s.expireJobHandler; handler != nil {
|
||||
return handler(jobID, reason)
|
||||
}
|
||||
if s.plugin == nil {
|
||||
return nil, false, fmt.Errorf("plugin is not enabled")
|
||||
}
|
||||
return s.plugin.ExpireJob(jobID, reason)
|
||||
}
|
||||
|
||||
// ListPluginActivities returns plugin job activities for monitoring.
|
||||
func (s *AdminServer) ListPluginActivities(jobType string, limit int) []adminplugin.JobActivity {
|
||||
if s.plugin == nil {
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@@ -130,6 +131,47 @@ func (s *AdminServer) GetPluginJobDetailAPI(w http.ResponseWriter, r *http.Reque
|
||||
writeJSON(w, http.StatusOK, detail)
|
||||
}
|
||||
|
||||
// ExpirePluginJobAPI marks a job as failed so it no longer blocks scheduling.
|
||||
func (s *AdminServer) ExpirePluginJobAPI(w http.ResponseWriter, r *http.Request) {
|
||||
jobID := strings.TrimSpace(mux.Vars(r)["jobId"])
|
||||
if jobID == "" {
|
||||
writeJSONError(w, http.StatusBadRequest, "jobId is required")
|
||||
return
|
||||
}
|
||||
|
||||
var req struct {
|
||||
Reason string `json:"reason"`
|
||||
}
|
||||
|
||||
if err := decodeJSONBody(newJSONMaxReader(w, r), &req); err != nil && err != io.EOF {
|
||||
writeJSONError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
job, expired, err := s.ExpirePluginJob(jobID, req.Reason)
|
||||
if err != nil {
|
||||
if errors.Is(err, plugin.ErrJobNotFound) {
|
||||
writeJSONError(w, http.StatusNotFound, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSONError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
response := map[string]interface{}{
|
||||
"job_id": jobID,
|
||||
"expired": expired,
|
||||
}
|
||||
if job != nil {
|
||||
response["job"] = job
|
||||
}
|
||||
if !expired {
|
||||
response["message"] = "job is not active"
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GetPluginActivitiesAPI returns recent plugin activities.
|
||||
func (s *AdminServer) GetPluginActivitiesAPI(w http.ResponseWriter, r *http.Request) {
|
||||
query := r.URL.Query()
|
||||
|
||||
@@ -1,11 +1,120 @@
|
||||
package dash
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/seaweedfs/seaweedfs/weed/admin/plugin"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||
)
|
||||
|
||||
func TestExpirePluginJobAPI(t *testing.T) {
|
||||
makeRequest := func(adminServer *AdminServer, jobID string, body io.Reader) *httptest.ResponseRecorder {
|
||||
req := httptest.NewRequest(http.MethodPost, "/api/plugin/jobs/"+jobID+"/expire", body)
|
||||
req = mux.SetURLVars(req, map[string]string{"jobId": jobID})
|
||||
recorder := httptest.NewRecorder()
|
||||
adminServer.ExpirePluginJobAPI(recorder, req)
|
||||
return recorder
|
||||
}
|
||||
|
||||
t.Run("empty job id", func(t *testing.T) {
|
||||
recorder := makeRequest(&AdminServer{}, "", nil)
|
||||
if recorder.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400, got %d", recorder.Code)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("invalid json", func(t *testing.T) {
|
||||
recorder := makeRequest(&AdminServer{}, "job-id", strings.NewReader("{"))
|
||||
if recorder.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400, got %d", recorder.Code)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("job not found", func(t *testing.T) {
|
||||
adminServer := &AdminServer{
|
||||
expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
|
||||
return nil, false, plugin.ErrJobNotFound
|
||||
},
|
||||
}
|
||||
recorder := makeRequest(adminServer, "missing", strings.NewReader(`{"reason":"nope"}`))
|
||||
if recorder.Code != http.StatusNotFound {
|
||||
t.Fatalf("expected 404, got %d", recorder.Code)
|
||||
}
|
||||
var payload map[string]any
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||
t.Fatalf("failed to unmarshal body: %v", err)
|
||||
}
|
||||
if payload["error"] == nil {
|
||||
t.Fatalf("expected error payload, got %v", payload)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("successful expire", func(t *testing.T) {
|
||||
expected := &plugin.TrackedJob{JobID: "foo", State: "assigned"}
|
||||
adminServer := &AdminServer{
|
||||
expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
|
||||
if jobID != "foo" {
|
||||
return nil, false, errors.New("unexpected")
|
||||
}
|
||||
return expected, true, nil
|
||||
},
|
||||
}
|
||||
recorder := makeRequest(adminServer, "foo", strings.NewReader(`{"reason":"cleanup"}`))
|
||||
if recorder.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d", recorder.Code)
|
||||
}
|
||||
var payload map[string]any
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||
t.Fatalf("failed to decode payload: %v", err)
|
||||
}
|
||||
if payload["job_id"] != "foo" {
|
||||
t.Fatalf("expected job_id foo, got %v", payload["job_id"])
|
||||
}
|
||||
if expired, ok := payload["expired"].(bool); !ok || !expired {
|
||||
t.Fatalf("expected expired=true, got %v", payload["expired"])
|
||||
}
|
||||
jobData, ok := payload["job"].(map[string]any)
|
||||
if !ok || jobData["job_id"] != "foo" {
|
||||
t.Fatalf("expected job info with job_id, got %v", payload["job"])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("non-active job", func(t *testing.T) {
|
||||
adminServer := &AdminServer{
|
||||
expireJobHandler: func(jobID, reason string) (*plugin.TrackedJob, bool, error) {
|
||||
return nil, false, nil
|
||||
},
|
||||
}
|
||||
recorder := makeRequest(adminServer, "bar", strings.NewReader(`{"reason":"ignore"}`))
|
||||
if recorder.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d", recorder.Code)
|
||||
}
|
||||
var payload map[string]any
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||
t.Fatalf("failed to decode payload: %v", err)
|
||||
}
|
||||
if payload["job_id"] != "bar" {
|
||||
t.Fatalf("expected job_id bar, got %v", payload["job_id"])
|
||||
}
|
||||
if expired, ok := payload["expired"].(bool); !ok || expired {
|
||||
t.Fatalf("expected expired=false, got %v", payload["expired"])
|
||||
}
|
||||
if payload["message"] != "job is not active" {
|
||||
t.Fatalf("expected message job is not active, got %v", payload["message"])
|
||||
}
|
||||
if _, exists := payload["job"]; exists {
|
||||
t.Fatalf("expected no job payload for non-active job, got %v", payload["job"])
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestBuildJobSpecFromProposalDoesNotReuseProposalID(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user