admin: fix capacity leak in maintenance system by preserving Task IDs (#8214)

* admin: fix capacity leak in maintenance system by preserving Task IDs

Preserve the original TaskID generated during detection, and sync task
states (Assign/Complete/Retry) with the ActiveTopology. This ensures that
capacity reserved during task assignment is properly released when a
task completes or fails, preventing 'need 9, have 0' capacity-exhaustion errors.

Fixes https://github.com/seaweedfs/seaweedfs/issues/8202

* Update weed/admin/maintenance/maintenance_queue.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* Update weed/admin/maintenance/maintenance_queue.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* test: rename ActiveTopologySync to TaskIDPreservation

Rename the test case to more accurately reflect its scope, as suggested
by the code review bot.

* Add TestMaintenanceQueue_ActiveTopologySync to verify task state synchronization and capacity management

* Implement task assignment rollback and add verification test

* Enhance ActiveTopology.CompleteTask to support pending tasks

* Populate storage impact in MaintenanceIntegration.SyncTask

* Release capacity in RemoveStaleWorkers when worker becomes unavailable

* Release capacity in MaintenanceManager.CancelTask when pending task is cancelled

* Sync reloaded tasks with ActiveTopology in LoadTasksFromPersistence

* Add verification tests for consistent capacity management lifecycle

* Add TestMaintenanceQueue_RetryCapacitySync to verify capacity tracking during retries

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Chris Lu
2026-02-04 20:39:34 -08:00
committed by GitHub
parent 2ecbae3611
commit 19c18d827a
6 changed files with 696 additions and 7 deletions

View File

@@ -90,6 +90,11 @@ func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
}
}
}
// Sync task with ActiveTopology for capacity tracking
if mq.integration != nil {
mq.integration.SyncTask(task)
}
}
// Sort pending tasks by priority and schedule time
@@ -134,7 +139,9 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
return
}
task.ID = generateTaskID()
if task.ID == "" {
task.ID = generateTaskID()
}
task.Status = TaskStatusPending
task.CreatedAt = time.Now()
task.MaxRetries = 3 // Default retry count
@@ -200,6 +207,7 @@ func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult)
}
task := &MaintenanceTask{
ID: result.TaskID,
Type: result.TaskType,
Priority: result.Priority,
VolumeID: result.VolumeID,
@@ -311,6 +319,24 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
selectedTask.WorkerID = workerID
selectedTask.StartedAt = &now
// Notify ActiveTopology to reserve capacity (move from pending to assigned)
if mq.integration != nil {
if at := mq.integration.GetActiveTopology(); at != nil {
if err := at.AssignTask(selectedTask.ID); err != nil {
glog.Warningf("Failed to update ActiveTopology for task assignment %s: %v. Rolling back assignment.", selectedTask.ID, err)
// Rollback assignment in MaintenanceQueue
selectedTask.Status = TaskStatusPending
selectedTask.WorkerID = ""
selectedTask.StartedAt = nil
if len(selectedTask.AssignmentHistory) > 0 {
selectedTask.AssignmentHistory = selectedTask.AssignmentHistory[:len(selectedTask.AssignmentHistory)-1]
}
// Return nil so the task is not removed from pendingTasks and not returned to the worker
return nil
}
}
}
// Remove from pending tasks
mq.pendingTasks = append(mq.pendingTasks[:selectedIndex], mq.pendingTasks[selectedIndex+1:]...)
@@ -342,6 +368,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
return
}
// Notify ActiveTopology to release capacity (move from assigned to recent)
// We do this for both success and failure cases to release the capacity
if mq.integration != nil {
if at := mq.integration.GetActiveTopology(); at != nil {
if task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress {
// Ignore error as task might not be in ActiveTopology (e.g. after restart)
_ = at.CompleteTask(taskID)
}
}
}
completedTime := time.Now()
task.CompletedAt = &completedTime
@@ -377,6 +414,12 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
mq.pendingTasks = append(mq.pendingTasks, task)
// Resync with ActiveTopology (re-add as pending)
if mq.integration != nil {
mq.integration.SyncTask(task)
}
// Save task state after retry setup
mq.saveTaskState(task)
glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
@@ -703,6 +746,13 @@ func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
task.Error = "Worker became unavailable"
completedTime := time.Now()
task.CompletedAt = &completedTime
// Notify ActiveTopology to release capacity
if mq.integration != nil {
if at := mq.integration.GetActiveTopology(); at != nil {
_ = at.CompleteTask(task.ID)
}
}
}
}