admin: fix capacity leak in maintenance system by preserving Task IDs (#8214)

* admin: fix capacity leak in maintenance system by preserving Task IDs

Preserve the original TaskID generated during detection, and sync task
states (Assign/Complete/Retry) with the ActiveTopology. This ensures that
capacity reserved during task assignment is properly released when a
task completes or fails, preventing 'need 9, have 0' capacity-exhaustion errors.

Fixes https://github.com/seaweedfs/seaweedfs/issues/8202

* Update weed/admin/maintenance/maintenance_queue.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* Update weed/admin/maintenance/maintenance_queue.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* test: rename ActiveTopologySync to TaskIDPreservation

Rename the test case to more accurately reflect its scope, as suggested
by the code review bot.

* Add TestMaintenanceQueue_ActiveTopologySync to verify task state synchronization and capacity management

* Implement task assignment rollback and add verification test

* Enhance ActiveTopology.CompleteTask to support pending tasks

* Populate storage impact in MaintenanceIntegration.SyncTask

* Release capacity in RemoveStaleWorkers when worker becomes unavailable

* Release capacity in MaintenanceManager.CancelTask when pending task is cancelled

* Sync reloaded tasks with ActiveTopology in LoadTasksFromPersistence

* Add verification tests for consistent capacity management lifecycle

* Add TestMaintenanceQueue_RetryCapacitySync to verify capacity tracking during retries

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Chris Lu
2026-02-04 20:39:34 -08:00
committed by GitHub
parent 2ecbae3611
commit 19c18d827a
6 changed files with 696 additions and 7 deletions

View File

@@ -90,6 +90,11 @@ func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
}
}
}
// Sync task with ActiveTopology for capacity tracking
if mq.integration != nil {
mq.integration.SyncTask(task)
}
}
// Sort pending tasks by priority and schedule time
@@ -134,7 +139,9 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
return
}
task.ID = generateTaskID()
if task.ID == "" {
task.ID = generateTaskID()
}
task.Status = TaskStatusPending
task.CreatedAt = time.Now()
task.MaxRetries = 3 // Default retry count
@@ -200,6 +207,7 @@ func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult)
}
task := &MaintenanceTask{
ID: result.TaskID,
Type: result.TaskType,
Priority: result.Priority,
VolumeID: result.VolumeID,
@@ -311,6 +319,24 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
selectedTask.WorkerID = workerID
selectedTask.StartedAt = &now
// Notify ActiveTopology to reserve capacity (move from pending to assigned)
if mq.integration != nil {
if at := mq.integration.GetActiveTopology(); at != nil {
if err := at.AssignTask(selectedTask.ID); err != nil {
glog.Warningf("Failed to update ActiveTopology for task assignment %s: %v. Rolling back assignment.", selectedTask.ID, err)
// Rollback assignment in MaintenanceQueue
selectedTask.Status = TaskStatusPending
selectedTask.WorkerID = ""
selectedTask.StartedAt = nil
if len(selectedTask.AssignmentHistory) > 0 {
selectedTask.AssignmentHistory = selectedTask.AssignmentHistory[:len(selectedTask.AssignmentHistory)-1]
}
// Return nil so the task is not removed from pendingTasks and not returned to the worker
return nil
}
}
}
// Remove from pending tasks
mq.pendingTasks = append(mq.pendingTasks[:selectedIndex], mq.pendingTasks[selectedIndex+1:]...)
@@ -342,6 +368,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
return
}
// Notify ActiveTopology to release capacity (move from assigned to recent)
// We do this for both success and failure cases to release the capacity
if mq.integration != nil {
if at := mq.integration.GetActiveTopology(); at != nil {
if task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress {
// Ignore error as task might not be in ActiveTopology (e.g. after restart)
_ = at.CompleteTask(taskID)
}
}
}
completedTime := time.Now()
task.CompletedAt = &completedTime
@@ -377,6 +414,12 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
mq.pendingTasks = append(mq.pendingTasks, task)
// Resync with ActiveTopology (re-add as pending)
if mq.integration != nil {
mq.integration.SyncTask(task)
}
// Save task state after retry setup
mq.saveTaskState(task)
glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
@@ -703,6 +746,13 @@ func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
task.Error = "Worker became unavailable"
completedTime := time.Now()
task.CompletedAt = &completedTime
// Notify ActiveTopology to release capacity
if mq.integration != nil {
if at := mq.integration.GetActiveTopology(); at != nil {
_ = at.CompleteTask(task.ID)
}
}
}
}