admin: fix capacity leak in maintenance system by preserving Task IDs (#8214)
* admin: fix capacity leak in maintenance system by preserving Task IDs
  Preserve the original TaskID generated during detection and sync task states (Assign/Complete/Retry) with ActiveTopology. This ensures that capacity reserved during task assignment is properly released when a task completes or fails, preventing 'need 9, have 0' capacity exhaustion.
  Fixes https://github.com/seaweedfs/seaweedfs/issues/8202
* Update weed/admin/maintenance/maintenance_queue.go
  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
* Update weed/admin/maintenance/maintenance_queue.go
  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
* test: rename ActiveTopologySync to TaskIDPreservation
  Rename the test case to more accurately reflect its scope, as suggested by the code review bot.
* Add TestMaintenanceQueue_ActiveTopologySync to verify task state synchronization and capacity management
* Implement task assignment rollback and add verification test
* Enhance ActiveTopology.CompleteTask to support pending tasks
* Populate storage impact in MaintenanceIntegration.SyncTask
* Release capacity in RemoveStaleWorkers when worker becomes unavailable
* Release capacity in MaintenanceManager.CancelTask when pending task is cancelled
* Sync reloaded tasks with ActiveTopology in LoadTasksFromPersistence
* Add verification tests for consistent capacity management lifecycle
* Add TestMaintenanceQueue_RetryCapacitySync to verify capacity tracking during retries
---------
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -90,6 +90,11 @@ func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sync task with ActiveTopology for capacity tracking
|
||||
if mq.integration != nil {
|
||||
mq.integration.SyncTask(task)
|
||||
}
|
||||
}
|
||||
|
||||
// Sort pending tasks by priority and schedule time
|
||||
@@ -134,7 +139,9 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
|
||||
return
|
||||
}
|
||||
|
||||
task.ID = generateTaskID()
|
||||
if task.ID == "" {
|
||||
task.ID = generateTaskID()
|
||||
}
|
||||
task.Status = TaskStatusPending
|
||||
task.CreatedAt = time.Now()
|
||||
task.MaxRetries = 3 // Default retry count
|
||||
@@ -200,6 +207,7 @@ func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult)
|
||||
}
|
||||
|
||||
task := &MaintenanceTask{
|
||||
ID: result.TaskID,
|
||||
Type: result.TaskType,
|
||||
Priority: result.Priority,
|
||||
VolumeID: result.VolumeID,
|
||||
@@ -311,6 +319,24 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
|
||||
selectedTask.WorkerID = workerID
|
||||
selectedTask.StartedAt = &now
|
||||
|
||||
// Notify ActiveTopology to reserve capacity (move from pending to assigned)
|
||||
if mq.integration != nil {
|
||||
if at := mq.integration.GetActiveTopology(); at != nil {
|
||||
if err := at.AssignTask(selectedTask.ID); err != nil {
|
||||
glog.Warningf("Failed to update ActiveTopology for task assignment %s: %v. Rolling back assignment.", selectedTask.ID, err)
|
||||
// Rollback assignment in MaintenanceQueue
|
||||
selectedTask.Status = TaskStatusPending
|
||||
selectedTask.WorkerID = ""
|
||||
selectedTask.StartedAt = nil
|
||||
if len(selectedTask.AssignmentHistory) > 0 {
|
||||
selectedTask.AssignmentHistory = selectedTask.AssignmentHistory[:len(selectedTask.AssignmentHistory)-1]
|
||||
}
|
||||
// Return nil so the task is not removed from pendingTasks and not returned to the worker
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove from pending tasks
|
||||
mq.pendingTasks = append(mq.pendingTasks[:selectedIndex], mq.pendingTasks[selectedIndex+1:]...)
|
||||
|
||||
@@ -342,6 +368,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
|
||||
return
|
||||
}
|
||||
|
||||
// Notify ActiveTopology to release capacity (move from assigned to recent)
|
||||
// We do this for both success and failure cases to release the capacity
|
||||
if mq.integration != nil {
|
||||
if at := mq.integration.GetActiveTopology(); at != nil {
|
||||
if task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress {
|
||||
// Ignore error as task might not be in ActiveTopology (e.g. after restart)
|
||||
_ = at.CompleteTask(taskID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
completedTime := time.Now()
|
||||
task.CompletedAt = &completedTime
|
||||
|
||||
@@ -377,6 +414,12 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
|
||||
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
|
||||
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
|
||||
// Resync with ActiveTopology (re-add as pending)
|
||||
if mq.integration != nil {
|
||||
mq.integration.SyncTask(task)
|
||||
}
|
||||
|
||||
// Save task state after retry setup
|
||||
mq.saveTaskState(task)
|
||||
glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
|
||||
@@ -703,6 +746,13 @@ func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
|
||||
task.Error = "Worker became unavailable"
|
||||
completedTime := time.Now()
|
||||
task.CompletedAt = &completedTime
|
||||
|
||||
// Notify ActiveTopology to release capacity
|
||||
if mq.integration != nil {
|
||||
if at := mq.integration.GetActiveTopology(); at != nil {
|
||||
_ = at.CompleteTask(task.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user