fix: resolve ServerAddress to NodeId in maintenance task sync (#8508)

* fix: maintenance task topology lookup, retry, and stale task cleanup

  1. Strip the gRPC port from ServerAddress in SyncTask using ToHttpAddress() so that task targets match topology disk keys (NodeId format).
  2. Skip the capacity check when the topology has no disks yet (a startup race in which tasks are loaded from persistence before the first topology update).
  3. Don't retry permanent errors such as "volume not found"; these will never succeed on retry.
  4. Cancel all pending tasks of each task type before re-detection, ensuring that stale proposals from previous cycles are cleaned up.

  This prevents stale tasks from blocking new detection and from repeatedly failing.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* logs

  Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less lock scope

  Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-04 19:20:28 -08:00
parent 88e8342e44
commit c19f88eef1
4 changed files with 90 additions and 17 deletions
--- a/weed/admin/maintenance/maintenance_queue.go
+++ b/weed/admin/maintenance/maintenance_queue.go
@@ -4,6 +4,7 @@ import (
 	"crypto/rand"
 	"fmt"
 	"sort"
+	"strings"
 	"time"

 	"github.com/seaweedfs/seaweedfs/weed/glog"
@@ -230,6 +231,46 @@ func (mq *MaintenanceQueue) hasDuplicateTask(newTask *MaintenanceTask) bool {
 	return false
 }

+// CancelPendingTasksByType cancels every pending task of the given type and
+// returns how many tasks were cancelled.
+//
+// It is called before each detection cycle so that stale proposals from
+// previous cycles are cleaned up before new ones are created. Each matching
+// task is marked TaskStatusCancelled, stamped with a completion time, handed
+// to the ActiveTopology's CompleteTask (when an integration and a topology are
+// available), passed to removePendingOperation, and dropped from the pending
+// queue. Tasks of other types are kept in their original relative order.
+//
+// Task state is persisted from snapshots taken under the lock; this method
+// calls saveTaskState only after the queue mutex has been released.
+func (mq *MaintenanceQueue) CancelPendingTasksByType(taskType MaintenanceTaskType) int {
+	mq.mutex.Lock()
+
+	var remaining []*MaintenanceTask
+	var cancelledSnapshots []*MaintenanceTask
+	for _, task := range mq.pendingTasks {
+		if task.Type != taskType {
+			remaining = append(remaining, task)
+			continue
+		}
+
+		now := time.Now()
+		task.Status = TaskStatusCancelled
+		task.CompletedAt = &now
+		cancelledSnapshots = append(cancelledSnapshots, snapshotTask(task))
+		glog.V(1).Infof("Cancelled stale pending task %s (%s) for volume %d before re-detection",
+			task.ID, task.Type, task.VolumeID)
+
+		// Release capacity in ActiveTopology. This stays best-effort: a failure
+		// must not keep the task in the queue, but it is logged rather than
+		// silently discarded so that capacity leaks can be diagnosed.
+		if mq.integration != nil {
+			if at := mq.integration.GetActiveTopology(); at != nil {
+				if err := at.CompleteTask(task.ID); err != nil {
+					glog.V(1).Infof("Failed to release topology capacity for cancelled task %s: %v", task.ID, err)
+				}
+			}
+		}
+		mq.removePendingOperation(task.ID)
+	}
+	mq.pendingTasks = remaining
+	mq.mutex.Unlock()
+
+	// Persist cancelled state outside the lock to avoid blocking.
+	for _, snapshot := range cancelledSnapshots {
+		mq.saveTaskState(snapshot)
+	}
+
+	// Exactly one snapshot is appended per cancelled task, so the slice length
+	// is the cancellation count.
+	return len(cancelledSnapshots)
+}
+
 // AddTasksFromResults converts detection results to tasks and adds them to the queue
 func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult) {
 	for _, result := range results {
@@ -455,8 +496,8 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
 		task.Status = TaskStatusFailed
 		task.Error = error

-		// Check if task should be retried
-		if task.RetryCount < task.MaxRetries {
+		// Check if task should be retried (skip retry for permanent errors)
+		if task.RetryCount < task.MaxRetries && !isNonRetriableError(error) {
 			// Record unassignment due to failure/retry
 			if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
 				lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
@@ -559,6 +600,12 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
 	}
 }

+// isNonRetriableError reports whether errMsg describes a permanent failure,
+// i.e. one that will never succeed on retry (for example, the volume does not
+// exist on the source server). The check is a case-sensitive substring match
+// against the marker below.
+func isNonRetriableError(errMsg string) bool {
+	const permanentMarker = "not found"
+	return strings.Contains(errMsg, permanentMarker)
+}
+
 // UpdateTaskProgress updates the progress of a running task
 func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64) {
 	mq.mutex.Lock()