Files
seaweedFS/weed/admin/maintenance/maintenance_queue_test.go
Chris Lu 076d504044 fix(admin): reduce memory usage and verbose logging for large clusters (#8927)
* fix(admin): reduce memory usage and verbose logging for large clusters (#8919)

The admin server used excessive memory and produced thousands of log lines
on clusters with many volumes (e.g., 33k volumes). Three root causes:

1. Scanner duplicated all volume metrics: getVolumeHealthMetrics() created
   VolumeHealthMetrics objects, then convertToTaskMetrics() copied them all
   into identical types.VolumeHealthMetrics. Now uses the task-system type
   directly, eliminating the duplicate allocation and removing convertToTaskMetrics.

2. All previous task states loaded at startup: LoadTasksFromPersistence read
   and deserialized every .pb file from disk, logging each one. With thousands
   of balance tasks persisted, this caused massive startup I/O, memory usage,
   and log noise (including unguarded DEBUG glog.Infof per task). Now starts
   with an empty queue — the scanner re-detects current needs from live cluster
   state. Terminal tasks are purged from memory and disk when new scan results
   arrive.

3. Verbose per-volume/per-node logging: V(2) and V(3) logs produced thousands
   of lines per scan. Per-volume logs bumped to V(4), per-node/rack/disk logs
   bumped to V(3). Topology summary now logs counts instead of full node ID arrays.

Also removes lastTopologyInfo field from MaintenanceScanner — the raw protobuf
topology is returned as a local value and not retained between 30-minute scans.

* fix(admin): delete stale task files at startup, add DeleteAllTaskStates

Old task .pb files from previous runs were left on disk. The periodic
CleanupCompletedTasks still loads all files to find completed ones —
the same expensive 4GB path from the pprof profile.

Now at startup, DeleteAllTaskStates removes all .pb files by scanning
the directory without reading or deserializing them. The scanner will
re-detect any tasks still needed from live cluster state.

* fix(admin): don't persist terminal tasks to disk

CompleteTask was saving failed/completed tasks to disk where they'd
accumulate. The periodic cleanup only triggered for completed tasks,
not failed ones. Now terminal tasks are deleted from disk immediately
and only kept in memory for the current session's UI.

* fix(admin): cap in-memory tasks to 100 per job type

Without a limit, the task map grows unbounded — balance could create
thousands of pending tasks for a cluster with many imbalanced volumes.
Now AddTask rejects new tasks when a job type already has 100 in the
queue. The scanner will re-detect skipped volumes on the next scan.

* fix(admin): address PR review - memory-only purge, active-only capacity

- purgeTerminalTasks now only cleans in-memory map (terminal tasks are
  already deleted from disk by CompleteTask)
- Per-type capacity limit counts only active tasks (pending/assigned/
  in_progress), not terminal ones
- When at capacity, purge terminal tasks first before rejecting

* fix(admin): fix orphaned comment, add TaskStatusCancelled to terminal switch

- Move hasQueuedOrActiveTaskForVolume comment to its function definition
- Add TaskStatusCancelled to the terminal state switch in CompleteTask
  so cancelled task files are deleted from disk
2026-04-04 18:45:57 -07:00

1093 lines
31 KiB
Go

package maintenance
import (
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/admin/topology"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
)
// Test suite for canScheduleTaskNow() function and related scheduling logic
//
// This test suite ensures that:
// 1. The fallback scheduling logic works correctly when no integration is present
// 2. Task concurrency limits are properly enforced per task type
// 3. Different task types don't interfere with each other's concurrency limits
// 4. Custom policies with higher concurrency limits work correctly
// 5. Edge cases (nil tasks, empty task types) are handled gracefully
// 6. Helper functions (GetRunningTaskCount, canExecuteTaskType, etc.) work correctly
//
// Background: The canScheduleTaskNow() function is critical for task assignment.
// It was previously failing due to an overly restrictive integration scheduler,
// so we implemented a temporary fix that bypasses the integration and uses
// fallback logic based on simple concurrency limits per task type.
// TestCanScheduleTaskNow_FallbackLogic verifies that an empty queue with no
// policy and no integration schedules a new task via the fallback logic.
func TestCanScheduleTaskNow_FallbackLogic(t *testing.T) {
	// Build a queue that forces the fallback path: nil policy, nil integration.
	queue := &MaintenanceQueue{
		tasks:        make(map[string]*MaintenanceTask),
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil, // no policy -> default behavior
		integration:  nil, // no integration -> fallback logic
	}

	candidate := &MaintenanceTask{
		ID:     "test-task-1",
		Type:   MaintenanceTaskType("erasure_coding"),
		Status: TaskStatusPending,
	}

	// No running tasks and a default max-concurrent of 1, so this must succeed.
	if ok := queue.canScheduleTaskNow(candidate); !ok {
		t.Errorf("Expected canScheduleTaskNow to return true with fallback logic, got false")
	}
}
// TestCanScheduleTaskNow_FallbackWithRunningTasks checks that the fallback
// logic refuses a task once its type already has a task running (default
// per-type limit is 1).
func TestCanScheduleTaskNow_FallbackWithRunningTasks(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks: map[string]*MaintenanceTask{
			"running-task": {
				ID:     "running-task",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusInProgress,
			},
		},
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil,
		integration:  nil,
	}

	candidate := &MaintenanceTask{
		ID:     "test-task-2",
		Type:   MaintenanceTaskType("erasure_coding"),
		Status: TaskStatusPending,
	}

	// One EC task is already in progress and the default limit is 1.
	if queue.canScheduleTaskNow(candidate) {
		t.Errorf("Expected canScheduleTaskNow to return false when at capacity, got true")
	}
}
// TestCanScheduleTaskNow_DifferentTaskTypes confirms that concurrency limits
// are tracked per task type: a running erasure_coding task blocks further EC
// tasks but does not block a vacuum task.
func TestCanScheduleTaskNow_DifferentTaskTypes(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks: map[string]*MaintenanceTask{
			"running-ec-task": {
				ID:     "running-ec-task",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusInProgress,
			},
		},
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil,
		integration:  nil,
	}

	// A vacuum task is unaffected by the running EC task.
	vacuumTask := &MaintenanceTask{
		ID:     "vacuum-task",
		Type:   MaintenanceTaskType("vacuum"),
		Status: TaskStatusPending,
	}
	if !queue.canScheduleTaskNow(vacuumTask) {
		t.Errorf("Expected canScheduleTaskNow to return true for different task type, got false")
	}

	// A second EC task hits the default per-type limit of 1.
	ecTask := &MaintenanceTask{
		ID:     "ec-task",
		Type:   MaintenanceTaskType("erasure_coding"),
		Status: TaskStatusPending,
	}
	if queue.canScheduleTaskNow(ecTask) {
		t.Errorf("Expected canScheduleTaskNow to return false for same task type at capacity, got true")
	}
}
// TestCanScheduleTaskNow_WithIntegration wires up a real
// MaintenanceIntegration and verifies scheduling still succeeds (the current
// implementation uses the fallback concurrency check).
func TestCanScheduleTaskNow_WithIntegration(t *testing.T) {
	policy := &MaintenancePolicy{
		TaskPolicies:                 make(map[string]*worker_pb.TaskPolicy),
		GlobalMaxConcurrent:          10,
		DefaultRepeatIntervalSeconds: 24 * 60 * 60, // 24 hours in seconds
		DefaultCheckIntervalSeconds:  60 * 60,      // 1 hour in seconds
	}
	queue := NewMaintenanceQueue(policy)

	// A basic integration; production setups would be more complex.
	queue.SetIntegration(NewMaintenanceIntegration(queue, policy))

	candidate := &MaintenanceTask{
		ID:     "test-task-3",
		Type:   MaintenanceTaskType("erasure_coding"),
		Status: TaskStatusPending,
	}

	// With the fallback logic in place, an empty queue accepts the task.
	if !queue.canScheduleTaskNow(candidate) {
		t.Errorf("Expected canScheduleTaskNow to return true with fallback logic, got false")
	}
}
// TestGetRunningTaskCount verifies that the helper counts only tasks in the
// in-progress or assigned states, bucketed per task type.
func TestGetRunningTaskCount(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks: map[string]*MaintenanceTask{
			// Two EC tasks count as running (in_progress + assigned)...
			"task1": {
				ID:     "task1",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusInProgress,
			},
			"task2": {
				ID:     "task2",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusAssigned,
			},
			// ...one vacuum task is running...
			"task3": {
				ID:     "task3",
				Type:   MaintenanceTaskType("vacuum"),
				Status: TaskStatusInProgress,
			},
			// ...and a completed task must not be counted.
			"task4": {
				ID:     "task4",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusCompleted,
			},
		},
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
	}

	if n := queue.GetRunningTaskCount(MaintenanceTaskType("erasure_coding")); n != 2 {
		t.Errorf("Expected 2 running EC tasks, got %d", n)
	}
	if n := queue.GetRunningTaskCount(MaintenanceTaskType("vacuum")); n != 1 {
		t.Errorf("Expected 1 running vacuum task, got %d", n)
	}
	if n := queue.GetRunningTaskCount(MaintenanceTaskType("balance")); n != 0 {
		t.Errorf("Expected 0 running balance tasks, got %d", n)
	}
}
// TestCanExecuteTaskType exercises the fallback helper directly: a type at
// its concurrency limit is refused while an idle type is allowed.
func TestCanExecuteTaskType(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks: map[string]*MaintenanceTask{
			"running-task": {
				ID:     "running-task",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusInProgress,
			},
		},
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil, // nil policy -> default max concurrent = 1
		integration:  nil,
	}

	// EC already has one running task against a limit of 1.
	if queue.canExecuteTaskType(MaintenanceTaskType("erasure_coding")) {
		t.Errorf("Expected canExecuteTaskType to return false for EC at capacity, got true")
	}
	// Vacuum has nothing running, so it fits under the default limit.
	if !queue.canExecuteTaskType(MaintenanceTaskType("vacuum")) {
		t.Errorf("Expected canExecuteTaskType to return true for vacuum, got false")
	}
}
// TestGetMaxConcurrentForTaskType_DefaultBehavior checks that, with neither a
// policy nor an integration configured, the per-type concurrency limit
// defaults to 1 for any task type.
func TestGetMaxConcurrentForTaskType_DefaultBehavior(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks:        make(map[string]*MaintenanceTask),
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil,
		integration:  nil,
	}

	// Both types should fall back to the same default of 1.
	for _, taskType := range []MaintenanceTaskType{"erasure_coding", "vacuum"} {
		if limit := queue.getMaxConcurrentForTaskType(taskType); limit != 1 {
			t.Errorf("Expected default max concurrent to be 1, got %d", limit)
		}
	}
}
// Test edge cases and error conditions

// TestCanScheduleTaskNow_NilTask documents that passing a nil task is a
// programmer error and is expected to panic rather than be tolerated.
func TestCanScheduleTaskNow_NilTask(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks:        make(map[string]*MaintenanceTask),
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil,
		integration:  nil,
	}

	// Catch the expected panic; the test fails if none occurs.
	defer func() {
		if recover() == nil {
			t.Errorf("Expected canScheduleTaskNow to panic with nil task, but it didn't")
		}
	}()

	queue.canScheduleTaskNow(nil)
}
// TestCanScheduleTaskNow_EmptyTaskType verifies that a task with an empty
// type string is still schedulable on an empty queue.
func TestCanScheduleTaskNow_EmptyTaskType(t *testing.T) {
	queue := &MaintenanceQueue{
		tasks:        make(map[string]*MaintenanceTask),
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       nil,
		integration:  nil,
	}

	candidate := &MaintenanceTask{
		ID:     "empty-type-task",
		Type:   MaintenanceTaskType(""), // deliberately empty task type
		Status: TaskStatusPending,
	}

	// An empty type should be handled gracefully, not rejected or panicked on.
	if !queue.canScheduleTaskNow(candidate) {
		t.Errorf("Expected canScheduleTaskNow to handle empty task type, got false")
	}
}
// TestCanScheduleTaskNow_WithPolicy verifies that an explicit policy raises the
// per-type concurrency limit above the default: with MaxConcurrent=3 for EC,
// a third task is accepted while two are running, and a fourth is refused.
func TestCanScheduleTaskNow_WithPolicy(t *testing.T) {
// Test with a policy that allows higher concurrency
policy := &MaintenancePolicy{
TaskPolicies: map[string]*worker_pb.TaskPolicy{
string(MaintenanceTaskType("erasure_coding")): {
Enabled: true,
MaxConcurrent: 3,
RepeatIntervalSeconds: 60 * 60, // 1 hour
CheckIntervalSeconds: 60 * 60, // 1 hour
},
string(MaintenanceTaskType("vacuum")): {
Enabled: true,
MaxConcurrent: 2,
RepeatIntervalSeconds: 60 * 60, // 1 hour
CheckIntervalSeconds: 60 * 60, // 1 hour
},
},
GlobalMaxConcurrent: 10,
DefaultRepeatIntervalSeconds: 24 * 60 * 60, // 24 hours in seconds
DefaultCheckIntervalSeconds: 60 * 60, // 1 hour in seconds
}
// Start with two EC tasks already counted as running
// (in_progress and assigned both count).
mq := &MaintenanceQueue{
tasks: map[string]*MaintenanceTask{
"running-task-1": {
ID: "running-task-1",
Type: MaintenanceTaskType("erasure_coding"),
Status: TaskStatusInProgress,
},
"running-task-2": {
ID: "running-task-2",
Type: MaintenanceTaskType("erasure_coding"),
Status: TaskStatusAssigned,
},
},
pendingTasks: []*MaintenanceTask{},
workers: make(map[string]*MaintenanceWorker),
policy: policy,
integration: nil,
}
task := &MaintenanceTask{
ID: "test-task-policy",
Type: MaintenanceTaskType("erasure_coding"),
Status: TaskStatusPending,
}
// Should return true because we have 2 running EC tasks but max is 3
result := mq.canScheduleTaskNow(task)
if !result {
t.Errorf("Expected canScheduleTaskNow to return true with policy allowing 3 concurrent, got false")
}
// Add one more running task to reach the limit
mq.tasks["running-task-3"] = &MaintenanceTask{
ID: "running-task-3",
Type: MaintenanceTaskType("erasure_coding"),
Status: TaskStatusInProgress,
}
// Should return false because we now have 3 running EC tasks (at limit)
result = mq.canScheduleTaskNow(task)
if result {
t.Errorf("Expected canScheduleTaskNow to return false when at policy limit, got true")
}
}
// TestMaintenanceQueue_TaskIDPreservation verifies that both
// AddTasksFromResults and AddTask keep caller-supplied task IDs intact
// instead of generating new ones.
func TestMaintenanceQueue_TaskIDPreservation(t *testing.T) {
// Setup Policy
policy := &MaintenancePolicy{
TaskPolicies: make(map[string]*worker_pb.TaskPolicy),
GlobalMaxConcurrent: 10,
}
// Setup Queue and Integration
mq := NewMaintenanceQueue(policy)
// We handle the integration manually to avoid complex setup
// integration := NewMaintenanceIntegration(mq, policy)
// mq.SetIntegration(integration)
// 2. Verify ID Preservation in AddTasksFromResults
originalID := "ec_task_123"
results := []*TaskDetectionResult{
{
TaskID: originalID,
TaskType: MaintenanceTaskType("erasure_coding"),
VolumeID: 100,
Server: "server1",
Priority: PriorityNormal,
TypedParams: &worker_pb.TaskParams{},
},
}
mq.AddTasksFromResults(results)
// Verify task exists with correct ID
// The detection result's TaskID must be used verbatim as the queue key.
queuedTask, exists := mq.tasks[originalID]
if !exists {
t.Errorf("Task with original ID %s not found in queue", originalID)
} else {
if queuedTask.ID != originalID {
t.Errorf("Task ID mismatch: expected %s, got %s", originalID, queuedTask.ID)
}
}
// 3. Verify AddTask preserves ID
manualTask := &MaintenanceTask{
ID: "manual_id_456",
Type: MaintenanceTaskType("vacuum"),
Status: TaskStatusPending,
}
mq.AddTask(manualTask)
// AddTask must not overwrite an explicitly provided ID.
if manualTask.ID != "manual_id_456" {
t.Errorf("AddTask overwrote ID: expected manual_id_456, got %s", manualTask.ID)
}
}
// TestMaintenanceQueue_ActiveTopologySync walks one task through its lifecycle
// (pending -> assigned -> completed) and checks that the ActiveTopology's
// destination-capacity reservation stays in sync with the queue at each step.
func TestMaintenanceQueue_ActiveTopologySync(t *testing.T) {
// Setup Policy
policy := &MaintenancePolicy{
TaskPolicies: map[string]*worker_pb.TaskPolicy{
"balance": {MaxConcurrent: 1},
},
GlobalMaxConcurrent: 10,
}
// Setup Queue and Integration
mq := NewMaintenanceQueue(policy)
integration := NewMaintenanceIntegration(mq, policy)
mq.SetIntegration(integration)
// 4. Verify ActiveTopology Synchronization (Assign and Complete)
// Get and Setup Topology
at := integration.GetActiveTopology()
if at == nil {
t.Fatalf("ActiveTopology not found in integration")
}
// One server with a populated source disk (id 1) and an empty destination
// disk (id 2, MaxVolumeCount=10).
topologyInfo := &master_pb.TopologyInfo{
DataCenterInfos: []*master_pb.DataCenterInfo{
{
Id: "dc1",
RackInfos: []*master_pb.RackInfo{
{
Id: "rack1",
DataNodeInfos: []*master_pb.DataNodeInfo{
{
Id: "server1",
DiskInfos: map[string]*master_pb.DiskInfo{
"hdd": {
DiskId: 1,
VolumeCount: 1,
MaxVolumeCount: 10,
VolumeInfos: []*master_pb.VolumeInformationMessage{
{Id: 100, Collection: "col1"},
},
},
"hdd2": {
DiskId: 2,
VolumeCount: 0,
MaxVolumeCount: 10,
},
},
},
},
},
},
},
},
}
at.UpdateTopology(topologyInfo)
// Add pending task to ActiveTopology
taskID := "sync_test_123"
err := at.AddPendingTask(topology.TaskSpec{
TaskID: taskID,
TaskType: topology.TaskTypeBalance,
VolumeID: 100,
VolumeSize: 1024 * 1024,
Sources: []topology.TaskSourceSpec{
{ServerID: "server1", DiskID: 1},
},
Destinations: []topology.TaskDestinationSpec{
{ServerID: "server1", DiskID: 2},
},
})
if err != nil {
t.Fatalf("Failed to add pending task to ActiveTopology: %v", err)
}
// Add the same task to MaintenanceQueue
mq.AddTask(&MaintenanceTask{
ID: taskID,
Type: MaintenanceTaskType("balance"),
VolumeID: 100,
Server: "server1",
Collection: "col1",
TypedParams: &worker_pb.TaskParams{
TaskId: taskID,
Targets: []*worker_pb.TaskTarget{
{Node: "server1", DiskId: 2},
},
},
})
// Check initial available capacity on destination disk (server1:2)
// server1:2 has MaxVolumeCount=10, VolumeCount=0.
// Capacity should be 9 because AddPendingTask already reserved 1 slot.
capacityBefore := at.GetEffectiveAvailableCapacity("server1", 2)
if capacityBefore != 9 {
t.Errorf("Expected capacity 9 after AddPendingTask, got %d", capacityBefore)
}
// 5. Verify AssignTask (via GetNextTask)
mq.workers["worker1"] = &MaintenanceWorker{
ID: "worker1",
Status: "active",
Capabilities: []MaintenanceTaskType{"balance"},
MaxConcurrent: 10,
}
taskFound := mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"})
if taskFound == nil || taskFound.ID != taskID {
t.Fatalf("Expected to get task %s, got %+v", taskID, taskFound)
}
// Capacity should still be 9 on destination disk (server1:2)
// Assignment must not double-book the already-reserved slot.
capacityAfterAssign := at.GetEffectiveAvailableCapacity("server1", 2)
if capacityAfterAssign != 9 {
t.Errorf("Capacity should still be 9 after assignment, got %d", capacityAfterAssign)
}
// 6. Verify CompleteTask
// An empty error string marks the task as successfully completed.
mq.CompleteTask(taskID, "")
// Capacity should be released back to 10
capacityAfterComplete := at.GetEffectiveAvailableCapacity("server1", 2)
if capacityAfterComplete != 10 {
t.Errorf("Capacity should have returned to 10 after completion, got %d", capacityAfterComplete)
}
}
// TestMaintenanceQueue_StaleWorkerCapacityRelease verifies that removing a
// stale worker releases the destination capacity its assigned task reserved
// in the ActiveTopology.
func TestMaintenanceQueue_StaleWorkerCapacityRelease(t *testing.T) {
	// Setup
	policy := &MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance": {MaxConcurrent: 1},
		},
	}
	mq := NewMaintenanceQueue(policy)
	integration := NewMaintenanceIntegration(mq, policy)
	mq.SetIntegration(integration)
	at := integration.GetActiveTopology()

	// One server: a populated source disk (1) and an empty destination disk
	// (2) with 10 free slots.
	topologyInfo := &master_pb.TopologyInfo{
		DataCenterInfos: []*master_pb.DataCenterInfo{
			{
				Id: "dc1",
				RackInfos: []*master_pb.RackInfo{
					{
						Id: "rack1",
						DataNodeInfos: []*master_pb.DataNodeInfo{
							{
								Id: "server1",
								DiskInfos: map[string]*master_pb.DiskInfo{
									"hdd1": {DiskId: 1, VolumeCount: 1, MaxVolumeCount: 10},
									"hdd2": {DiskId: 2, VolumeCount: 0, MaxVolumeCount: 10},
								},
							},
						},
					},
				},
			},
		},
	}
	at.UpdateTopology(topologyInfo)

	taskID := "stale_test_123"
	// AddPendingTask reserves one slot on the destination disk; fail fast if
	// the reservation itself fails (previously this error was ignored).
	if err := at.AddPendingTask(topology.TaskSpec{
		TaskID:       taskID,
		TaskType:     topology.TaskTypeBalance,
		VolumeID:     100,
		VolumeSize:   1024,
		Sources:      []topology.TaskSourceSpec{{ServerID: "server1", DiskID: 1}},
		Destinations: []topology.TaskDestinationSpec{{ServerID: "server1", DiskID: 2}},
	}); err != nil {
		t.Fatalf("Failed to add pending task to ActiveTopology: %v", err)
	}
	mq.AddTask(&MaintenanceTask{
		ID:       taskID,
		Type:     "balance",
		VolumeID: 100,
		Server:   "server1",
		TypedParams: &worker_pb.TaskParams{
			TaskId:  taskID,
			Targets: []*worker_pb.TaskTarget{{Node: "server1", DiskId: 2}},
		},
	})

	mq.workers["worker1"] = &MaintenanceWorker{
		ID:            "worker1",
		Status:        "active",
		Capabilities:  []MaintenanceTaskType{"balance"},
		MaxConcurrent: 1,
		LastHeartbeat: time.Now(),
	}

	// Assign task — verify the assignment actually happened.
	if assigned := mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"}); assigned == nil {
		t.Fatalf("Expected task %s to be assigned to worker1, got nil", taskID)
	}
	// Verify capacity reserved (9 left)
	if got := at.GetEffectiveAvailableCapacity("server1", 2); got != 9 {
		t.Errorf("Expected capacity 9, got %d", got)
	}

	// Make worker stale by aging its heartbeat past the timeout.
	mq.workers["worker1"].LastHeartbeat = time.Now().Add(-1 * time.Hour)
	// Remove stale workers
	mq.RemoveStaleWorkers(10 * time.Minute)

	// Verify capacity released (back to 10)
	if got := at.GetEffectiveAvailableCapacity("server1", 2); got != 10 {
		t.Errorf("Expected capacity 10 after removing stale worker, got %d", got)
	}
}
// TestMaintenanceManager_CancelTaskCapacityRelease verifies that cancelling a
// task through the MaintenanceManager releases the destination capacity that
// was reserved for it in the ActiveTopology.
func TestMaintenanceManager_CancelTaskCapacityRelease(t *testing.T) {
	// Setup Manager
	config := DefaultMaintenanceConfig()
	mm := NewMaintenanceManager(nil, config)
	integration := mm.scanner.integration
	mq := mm.queue
	at := integration.GetActiveTopology()

	// One server: a populated source disk (1) and an empty destination disk
	// (2) with 10 free slots.
	topologyInfo := &master_pb.TopologyInfo{
		DataCenterInfos: []*master_pb.DataCenterInfo{
			{
				Id: "dc1",
				RackInfos: []*master_pb.RackInfo{
					{
						Id: "rack1",
						DataNodeInfos: []*master_pb.DataNodeInfo{
							{
								Id: "server1",
								DiskInfos: map[string]*master_pb.DiskInfo{
									"hdd1": {DiskId: 1, VolumeCount: 1, MaxVolumeCount: 10},
									"hdd2": {DiskId: 2, VolumeCount: 0, MaxVolumeCount: 10},
								},
							},
						},
					},
				},
			},
		},
	}
	at.UpdateTopology(topologyInfo)

	taskID := "cancel_test_123"
	// Note: AddPendingTask reserves capacity — fail fast if the reservation
	// fails instead of silently proceeding (previously the error was ignored).
	if err := at.AddPendingTask(topology.TaskSpec{
		TaskID:       taskID,
		TaskType:     topology.TaskTypeBalance,
		VolumeID:     100,
		VolumeSize:   1024,
		Sources:      []topology.TaskSourceSpec{{ServerID: "server1", DiskID: 1}},
		Destinations: []topology.TaskDestinationSpec{{ServerID: "server1", DiskID: 2}},
	}); err != nil {
		t.Fatalf("Failed to add pending task to ActiveTopology: %v", err)
	}
	mq.AddTask(&MaintenanceTask{
		ID:       taskID,
		Type:     "balance",
		VolumeID: 100,
		Server:   "server1",
		TypedParams: &worker_pb.TaskParams{
			TaskId:  taskID,
			Targets: []*worker_pb.TaskTarget{{Node: "server1", DiskId: 2}},
		},
	})

	// Verify capacity reserved (9 left)
	if got := at.GetEffectiveAvailableCapacity("server1", 2); got != 9 {
		t.Errorf("Expected capacity 9, got %d", got)
	}

	// Cancel task
	if err := mm.CancelTask(taskID); err != nil {
		t.Fatalf("Failed to cancel task: %v", err)
	}

	// Verify capacity released (back to 10)
	if got := at.GetEffectiveAvailableCapacity("server1", 2); got != 10 {
		t.Errorf("Expected capacity 10 after cancelling task, got %d", got)
	}
}
// MockPersistence is a stub persistence backend for queue tests: it serves a
// canned task list from LoadAllTaskStates and makes every other operation a
// no-op that reports success.
type MockPersistence struct {
tasks []*MaintenanceTask
}
// SaveTaskState is a no-op; tests never persist task state.
func (m *MockPersistence) SaveTaskState(task *MaintenanceTask) error { return nil }
// LoadTaskState always reports "not found" (nil task, nil error).
func (m *MockPersistence) LoadTaskState(taskID string) (*MaintenanceTask, error) { return nil, nil }
// LoadAllTaskStates returns the canned task list supplied at construction.
func (m *MockPersistence) LoadAllTaskStates() ([]*MaintenanceTask, error) { return m.tasks, nil }
// DeleteTaskState is a no-op.
func (m *MockPersistence) DeleteTaskState(taskID string) error { return nil }
// DeleteAllTaskStates is a no-op.
func (m *MockPersistence) DeleteAllTaskStates() error { return nil }
// CleanupCompletedTasks is a no-op.
func (m *MockPersistence) CleanupCompletedTasks() error { return nil }
// SaveTaskPolicy is a no-op.
func (m *MockPersistence) SaveTaskPolicy(taskType string, policy *TaskPolicy) error { return nil }
func TestMaintenanceQueue_LoadTasksStartsEmpty(t *testing.T) {
// Setup
policy := &MaintenancePolicy{
TaskPolicies: map[string]*worker_pb.TaskPolicy{
"balance": {MaxConcurrent: 1},
},
}
mq := NewMaintenanceQueue(policy)
// Setup mock persistence with tasks — these should NOT be loaded
mockTask := &MaintenanceTask{
ID: "old_task_123",
Type: "balance",
Status: TaskStatusPending,
}
mq.SetPersistence(&MockPersistence{tasks: []*MaintenanceTask{mockTask}})
// LoadTasksFromPersistence should be a no-op — scanner will re-detect
err := mq.LoadTasksFromPersistence()
if err != nil {
t.Fatalf("LoadTasksFromPersistence failed: %v", err)
}
// Queue should be empty — tasks will be re-detected by scanner
stats := mq.GetStats()
if stats.TotalTasks != 0 {
t.Errorf("Expected 0 tasks after startup, got %d", stats.TotalTasks)
}
}
// TestMaintenanceQueue_RetryCapacitySync verifies that when a task fails and
// is retried, the destination-capacity reservation in ActiveTopology is
// released and immediately re-reserved for the now-pending retry, leaving the
// observable capacity unchanged.
func TestMaintenanceQueue_RetryCapacitySync(t *testing.T) {
	// Setup
	policy := &MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance": {MaxConcurrent: 1},
		},
	}
	mq := NewMaintenanceQueue(policy)
	integration := NewMaintenanceIntegration(mq, policy)
	mq.SetIntegration(integration)
	at := integration.GetActiveTopology()

	// One server: a populated source disk (1) and an empty destination disk
	// (2) with 10 free slots.
	topologyInfo := &master_pb.TopologyInfo{
		DataCenterInfos: []*master_pb.DataCenterInfo{
			{
				Id: "dc1",
				RackInfos: []*master_pb.RackInfo{
					{
						Id: "rack1",
						DataNodeInfos: []*master_pb.DataNodeInfo{
							{
								Id: "server1",
								DiskInfos: map[string]*master_pb.DiskInfo{
									"hdd1": {DiskId: 1, VolumeCount: 1, MaxVolumeCount: 10},
									"hdd2": {DiskId: 2, VolumeCount: 0, MaxVolumeCount: 10},
								},
							},
						},
					},
				},
			},
		},
	}
	at.UpdateTopology(topologyInfo)

	taskID := "retry_test_123"
	// 1. Add task — AddPendingTask reserves one destination slot; fail fast
	// if the reservation fails (previously this error was ignored).
	if err := at.AddPendingTask(topology.TaskSpec{
		TaskID:       taskID,
		TaskType:     topology.TaskTypeBalance,
		VolumeID:     100,
		VolumeSize:   1024,
		Sources:      []topology.TaskSourceSpec{{ServerID: "server1", DiskID: 1}},
		Destinations: []topology.TaskDestinationSpec{{ServerID: "server1", DiskID: 2}},
	}); err != nil {
		t.Fatalf("Failed to add pending task to ActiveTopology: %v", err)
	}
	mq.AddTask(&MaintenanceTask{
		ID:         taskID,
		Type:       "balance",
		VolumeID:   100,
		Server:     "server1",
		MaxRetries: 3,
		TypedParams: &worker_pb.TaskParams{
			TaskId:  taskID,
			Sources: []*worker_pb.TaskSource{{Node: "server1", DiskId: 1}},
			Targets: []*worker_pb.TaskTarget{{Node: "server1", DiskId: 2}},
		},
	})
	mq.workers["worker1"] = &MaintenanceWorker{
		ID:            "worker1",
		Status:        "active",
		Capabilities:  []MaintenanceTaskType{"balance"},
		MaxConcurrent: 1,
		LastHeartbeat: time.Now(),
	}

	// 2. Assign task — verify the assignment actually happened.
	if assigned := mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"}); assigned == nil {
		t.Fatalf("Expected task %s to be assigned to worker1, got nil", taskID)
	}
	// Verify capacity reserved (9 left)
	if got := at.GetEffectiveAvailableCapacity("server1", 2); got != 9 {
		t.Errorf("Initial assignment: Expected capacity 9, got %d", got)
	}

	// 3. Complete with a non-empty error string to trigger the retry path.
	mq.CompleteTask(taskID, "simulated failure")

	// 4. Verify state after failure — guard the map lookup so a missing task
	// fails with a clear message instead of a nil-pointer panic.
	task, exists := mq.tasks[taskID]
	if !exists {
		t.Fatalf("Task %s missing from queue after failed completion", taskID)
	}
	if task.Status != TaskStatusPending {
		t.Errorf("Expected status pending for retry, got %v", task.Status)
	}
	if task.RetryCount != 1 {
		t.Errorf("Expected retry count 1, got %d", task.RetryCount)
	}

	// 5. Verify capacity in ActiveTopology
	// It should first release (back to 10) and then re-reserve (SyncTask) because it's pending again.
	// So it should still be 9.
	if got := at.GetEffectiveAvailableCapacity("server1", 2); got != 9 {
		t.Errorf("After retry sync: Expected capacity 9, got %d", got)
	}
}
// TestMaintenanceQueue_AssignTaskRollback verifies that when the
// ActiveTopology rejects an assignment (the task was never registered there),
// GetNextTask rolls the queue entry back to pending with no worker, no
// assignment history, and keeps it in the pendingTasks slice.
func TestMaintenanceQueue_AssignTaskRollback(t *testing.T) {
// Setup Policy
policy := &MaintenancePolicy{
TaskPolicies: map[string]*worker_pb.TaskPolicy{
"balance": {MaxConcurrent: 1},
},
GlobalMaxConcurrent: 10,
}
// Setup Queue and Integration
mq := NewMaintenanceQueue(policy)
integration := NewMaintenanceIntegration(mq, policy)
mq.SetIntegration(integration)
// Get Topology
at := integration.GetActiveTopology()
// Source disk 1 is full (1/1) and destination disk 2 has zero capacity.
topologyInfo := &master_pb.TopologyInfo{
DataCenterInfos: []*master_pb.DataCenterInfo{
{
Id: "dc1",
RackInfos: []*master_pb.RackInfo{
{
Id: "rack1",
DataNodeInfos: []*master_pb.DataNodeInfo{
{
Id: "server1",
DiskInfos: map[string]*master_pb.DiskInfo{
"hdd": {
DiskId: 1,
VolumeCount: 1,
MaxVolumeCount: 1, // Only 1 slot
VolumeInfos: []*master_pb.VolumeInformationMessage{
{Id: 100, Collection: "col1"},
},
},
"hdd2": {
DiskId: 2,
VolumeCount: 0,
MaxVolumeCount: 0, // NO CAPACITY for target
},
},
},
},
},
},
},
},
}
at.UpdateTopology(topologyInfo)
taskID := "rollback_test_123"
// 1. Add task to MaintenanceQueue ONLY
// It's not in ActiveTopology, so AssignTask will fail with "pending task not found"
mq.AddTask(&MaintenanceTask{
ID: taskID,
Type: MaintenanceTaskType("balance"),
VolumeID: 100,
Server: "server1",
Collection: "col1",
TypedParams: &worker_pb.TaskParams{
TaskId: taskID,
Targets: []*worker_pb.TaskTarget{
{Node: "server1", DiskId: 2},
},
},
})
// 2. Setup worker
mq.workers["worker1"] = &MaintenanceWorker{
ID: "worker1",
Status: "active",
Capabilities: []MaintenanceTaskType{"balance"},
MaxConcurrent: 10,
}
// 3. Try to get next task
taskFound := mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"})
// 4. Verify GetNextTask returned nil due to ActiveTopology.AssignTask failure
if taskFound != nil {
t.Errorf("Expected GetNextTask to return nil, got task %s", taskFound.ID)
}
// 5. Verify the task in MaintenanceQueue is rolled back to pending
mq.mutex.RLock()
task, exists := mq.tasks[taskID]
mq.mutex.RUnlock()
if !exists {
t.Fatalf("Task %s should still exist in MaintenanceQueue", taskID)
}
if task.Status != TaskStatusPending {
t.Errorf("Expected task status %v, got %v", TaskStatusPending, task.Status)
}
// Rollback must also clear the worker assignment and its history entry.
if task.WorkerID != "" {
t.Errorf("Expected task WorkerID to be empty, got %s", task.WorkerID)
}
if len(task.AssignmentHistory) != 0 {
t.Errorf("Expected assignment history to be empty, got %d records", len(task.AssignmentHistory))
}
// 6. Verify the task is still in pendingTasks slice
mq.mutex.RLock()
foundInPending := false
for _, pt := range mq.pendingTasks {
if pt.ID == taskID {
foundInPending = true
break
}
}
mq.mutex.RUnlock()
if !foundInPending {
t.Errorf("Task %s should still be in pendingTasks slice", taskID)
}
}
// TestGetNextTask_SkipsVolumeConflictsAcrossTypes verifies that assignment
// never hands out two tasks targeting the same volume, even across task
// types: the conflicting task stays pending and the scheduler moves on to a
// task for a different volume.
func TestGetNextTask_SkipsVolumeConflictsAcrossTypes(t *testing.T) {
	policy := &MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance":        {MaxConcurrent: 2},
			"erasure_coding": {MaxConcurrent: 2},
			"vacuum":         {MaxConcurrent: 2},
		},
	}
	mq := NewMaintenanceQueue(policy)
	now := time.Now()

	// t1: oldest, highest priority, volume 100 — should be assigned first.
	mq.AddTask(&MaintenanceTask{
		ID:          "t1",
		Type:        MaintenanceTaskType("balance"),
		Priority:    PriorityHigh,
		VolumeID:    100,
		Server:      "server1",
		ScheduledAt: now.Add(-3 * time.Second),
	})
	// t2 also targets volume 100; insert it directly (bypassing AddTask's
	// one-task-per-volume guard) to set up the conflict scenario.
	t2 := &MaintenanceTask{
		ID:          "t2",
		Type:        MaintenanceTaskType("erasure_coding"),
		Priority:    PriorityNormal,
		VolumeID:    100,
		Server:      "server1",
		Status:      TaskStatusPending,
		ScheduledAt: now.Add(-2 * time.Second),
	}
	mq.mutex.Lock()
	mq.tasks[t2.ID] = t2
	mq.pendingTasks = append(mq.pendingTasks, t2)
	mq.mutex.Unlock()
	// t3 targets a different volume and is therefore assignable.
	mq.AddTask(&MaintenanceTask{
		ID:          "t3",
		Type:        MaintenanceTaskType("vacuum"),
		Priority:    PriorityNormal,
		VolumeID:    200,
		Server:      "server1",
		ScheduledAt: now.Add(-1 * time.Second),
	})

	mq.workers["worker1"] = &MaintenanceWorker{
		ID:            "worker1",
		Status:        "active",
		Capabilities:  []MaintenanceTaskType{"balance", "erasure_coding", "vacuum"},
		MaxConcurrent: 2,
		LastHeartbeat: time.Now(),
	}
	mq.workers["worker2"] = &MaintenanceWorker{
		ID:            "worker2",
		Status:        "active",
		Capabilities:  []MaintenanceTaskType{"balance", "erasure_coding", "vacuum"},
		MaxConcurrent: 2,
		LastHeartbeat: time.Now(),
	}

	task1 := mq.GetNextTask("worker1", mq.workers["worker1"].Capabilities)
	if task1 == nil || task1.ID != "t1" {
		t.Fatalf("Expected first assignment to be t1, got %+v", task1)
	}
	task2 := mq.GetNextTask("worker2", mq.workers["worker2"].Capabilities)
	if task2 == nil {
		t.Fatalf("Expected a second task to be assigned, got nil")
	}
	if task2.ID != "t3" {
		t.Fatalf("Expected second assignment to skip volume 100 and pick t3, got %s", task2.ID)
	}
	// Guard the map lookup so a missing t2 fails with a clear message instead
	// of a nil-pointer panic.
	t2After, ok := mq.tasks["t2"]
	if !ok {
		t.Fatalf("Expected t2 to still be tracked in the queue")
	}
	if t2After.Status != TaskStatusPending {
		t.Fatalf("Expected t2 to remain pending due to volume conflict, got %s", t2After.Status)
	}
}
// TestAddTask_OnePendingTaskPerVolume verifies that only one pending task may
// target a given volume: a second task for the same volume — even of a
// different type — is rejected.
func TestAddTask_OnePendingTaskPerVolume(t *testing.T) {
	queue := NewMaintenanceQueue(&MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance":        {MaxConcurrent: 1},
			"erasure_coding": {MaxConcurrent: 1},
		},
	})

	// First task for volume 100 should be accepted.
	queue.AddTask(&MaintenanceTask{
		ID:       "t1",
		Type:     MaintenanceTaskType("balance"),
		VolumeID: 100,
		Server:   "server1",
	})
	// Second task targets the same volume and must be dropped.
	queue.AddTask(&MaintenanceTask{
		ID:       "t2",
		Type:     MaintenanceTaskType("erasure_coding"),
		VolumeID: 100,
		Server:   "server1",
	})

	queue.mutex.RLock()
	defer queue.mutex.RUnlock()

	if got := len(queue.tasks); got != 1 {
		t.Fatalf("Expected 1 task in queue, got %d", got)
	}
	if got := len(queue.pendingTasks); got != 1 {
		t.Fatalf("Expected 1 pending task, got %d", got)
	}
	if _, exists := queue.tasks["t1"]; !exists {
		t.Fatalf("Expected task t1 to be queued")
	}
	if _, exists := queue.tasks["t2"]; exists {
		t.Fatalf("Did not expect task t2 to be queued due to pending volume")
	}
}
// TestAddTask_RejectsWhenVolumeHasRunningTask verifies that a new task is
// rejected while another task for the same volume is actively running.
func TestAddTask_RejectsWhenVolumeHasRunningTask(t *testing.T) {
	queue := NewMaintenanceQueue(&MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance":        {MaxConcurrent: 1},
			"erasure_coding": {MaxConcurrent: 1},
		},
	})

	queue.AddTask(&MaintenanceTask{
		ID:       "t1",
		Type:     MaintenanceTaskType("balance"),
		VolumeID: 100,
		Server:   "server1",
	})

	// Flip t1 to in-progress to simulate an assignment.
	queue.mutex.Lock()
	queue.tasks["t1"].Status = TaskStatusInProgress
	queue.mutex.Unlock()

	// This second task targets the busy volume and must be rejected.
	queue.AddTask(&MaintenanceTask{
		ID:       "t2",
		Type:     MaintenanceTaskType("erasure_coding"),
		VolumeID: 100,
		Server:   "server1",
	})

	queue.mutex.RLock()
	defer queue.mutex.RUnlock()

	if got := len(queue.tasks); got != 1 {
		t.Fatalf("Expected 1 task in queue, got %d", got)
	}
	if _, exists := queue.tasks["t2"]; exists {
		t.Fatalf("Did not expect task t2 to be queued due to active volume task")
	}
}