* fix(admin): reduce memory usage and verbose logging for large clusters (#8919) The admin server used excessive memory and produced thousands of log lines on clusters with many volumes (e.g., 33k volumes). Three root causes: 1. Scanner duplicated all volume metrics: getVolumeHealthMetrics() created VolumeHealthMetrics objects, then convertToTaskMetrics() copied them all into identical types.VolumeHealthMetrics. Now uses the task-system type directly, eliminating the duplicate allocation and removing convertToTaskMetrics. 2. All previous task states loaded at startup: LoadTasksFromPersistence read and deserialized every .pb file from disk, logging each one. With thousands of balance tasks persisted, this caused massive startup I/O, memory usage, and log noise (including unguarded DEBUG glog.Infof per task). Now starts with an empty queue — the scanner re-detects current needs from live cluster state. Terminal tasks are purged from memory and disk when new scan results arrive. 3. Verbose per-volume/per-node logging: V(2) and V(3) logs produced thousands of lines per scan. Per-volume logs bumped to V(4), per-node/rack/disk logs bumped to V(3). Topology summary now logs counts instead of full node ID arrays. Also removes lastTopologyInfo field from MaintenanceScanner — the raw protobuf topology is returned as a local value and not retained between 30-minute scans. * fix(admin): delete stale task files at startup, add DeleteAllTaskStates Old task .pb files from previous runs were left on disk. The periodic CleanupCompletedTasks still loads all files to find completed ones — the same expensive 4GB path from the pprof profile. Now at startup, DeleteAllTaskStates removes all .pb files by scanning the directory without reading or deserializing them. The scanner will re-detect any tasks still needed from live cluster state. * fix(admin): don't persist terminal tasks to disk CompleteTask was saving failed/completed tasks to disk where they'd accumulate. 
The periodic cleanup only triggered for completed tasks, not failed ones. Now terminal tasks are deleted from disk immediately and only kept in memory for the current session's UI. * fix(admin): cap in-memory tasks to 100 per job type Without a limit, the task map grows unbounded — balance could create thousands of pending tasks for a cluster with many imbalanced volumes. Now AddTask rejects new tasks when a job type already has 100 in the queue. The scanner will re-detect skipped volumes on the next scan. * fix(admin): address PR review - memory-only purge, active-only capacity - purgeTerminalTasks now only cleans in-memory map (terminal tasks are already deleted from disk by CompleteTask) - Per-type capacity limit counts only active tasks (pending/assigned/ in_progress), not terminal ones - When at capacity, purge terminal tasks first before rejecting * fix(admin): fix orphaned comment, add TaskStatusCancelled to terminal switch - Move hasQueuedOrActiveTaskForVolume comment to its function definition - Add TaskStatusCancelled to the terminal state switch in CompleteTask so cancelled task files are deleted from disk
1093 lines
31 KiB
Go
1093 lines
31 KiB
Go
package maintenance
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/admin/topology"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
|
|
)
|
|
|
|
// Test suite for canScheduleTaskNow() function and related scheduling logic
|
|
//
|
|
// This test suite ensures that:
|
|
// 1. The fallback scheduling logic works correctly when no integration is present
|
|
// 2. Task concurrency limits are properly enforced per task type
|
|
// 3. Different task types don't interfere with each other's concurrency limits
|
|
// 4. Custom policies with higher concurrency limits work correctly
|
|
// 5. Edge cases (nil tasks, empty task types) are handled gracefully
|
|
// 6. Helper functions (GetRunningTaskCount, canExecuteTaskType, etc.) work correctly
|
|
//
|
|
// Background: The canScheduleTaskNow() function is critical for task assignment.
|
|
// It was previously failing due to an overly restrictive integration scheduler,
|
|
// so we implemented a temporary fix that bypasses the integration and uses
|
|
// fallback logic based on simple concurrency limits per task type.
|
|
|
|
func TestCanScheduleTaskNow_FallbackLogic(t *testing.T) {
|
|
// Test the current implementation which uses fallback logic
|
|
mq := &MaintenanceQueue{
|
|
tasks: make(map[string]*MaintenanceTask),
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil, // No policy for default behavior
|
|
integration: nil, // No integration to force fallback
|
|
}
|
|
|
|
task := &MaintenanceTask{
|
|
ID: "test-task-1",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusPending,
|
|
}
|
|
|
|
// Should return true with fallback logic (no running tasks, default max concurrent = 1)
|
|
result := mq.canScheduleTaskNow(task)
|
|
if !result {
|
|
t.Errorf("Expected canScheduleTaskNow to return true with fallback logic, got false")
|
|
}
|
|
}
|
|
|
|
func TestCanScheduleTaskNow_FallbackWithRunningTasks(t *testing.T) {
|
|
// Test fallback logic when there are already running tasks
|
|
mq := &MaintenanceQueue{
|
|
tasks: map[string]*MaintenanceTask{
|
|
"running-task": {
|
|
ID: "running-task",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusInProgress,
|
|
},
|
|
},
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil,
|
|
integration: nil,
|
|
}
|
|
|
|
task := &MaintenanceTask{
|
|
ID: "test-task-2",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusPending,
|
|
}
|
|
|
|
// Should return false because max concurrent is 1 and we have 1 running task
|
|
result := mq.canScheduleTaskNow(task)
|
|
if result {
|
|
t.Errorf("Expected canScheduleTaskNow to return false when at capacity, got true")
|
|
}
|
|
}
|
|
|
|
func TestCanScheduleTaskNow_DifferentTaskTypes(t *testing.T) {
|
|
// Test that different task types don't interfere with each other
|
|
mq := &MaintenanceQueue{
|
|
tasks: map[string]*MaintenanceTask{
|
|
"running-ec-task": {
|
|
ID: "running-ec-task",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusInProgress,
|
|
},
|
|
},
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil,
|
|
integration: nil,
|
|
}
|
|
|
|
// Test vacuum task when EC task is running
|
|
vacuumTask := &MaintenanceTask{
|
|
ID: "vacuum-task",
|
|
Type: MaintenanceTaskType("vacuum"),
|
|
Status: TaskStatusPending,
|
|
}
|
|
|
|
// Should return true because vacuum and erasure_coding are different task types
|
|
result := mq.canScheduleTaskNow(vacuumTask)
|
|
if !result {
|
|
t.Errorf("Expected canScheduleTaskNow to return true for different task type, got false")
|
|
}
|
|
|
|
// Test another EC task when one is already running
|
|
ecTask := &MaintenanceTask{
|
|
ID: "ec-task",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusPending,
|
|
}
|
|
|
|
// Should return false because max concurrent for EC is 1 and we have 1 running
|
|
result = mq.canScheduleTaskNow(ecTask)
|
|
if result {
|
|
t.Errorf("Expected canScheduleTaskNow to return false for same task type at capacity, got true")
|
|
}
|
|
}
|
|
|
|
func TestCanScheduleTaskNow_WithIntegration(t *testing.T) {
|
|
// Test with a real MaintenanceIntegration (will use fallback logic in current implementation)
|
|
policy := &MaintenancePolicy{
|
|
TaskPolicies: make(map[string]*worker_pb.TaskPolicy),
|
|
GlobalMaxConcurrent: 10,
|
|
DefaultRepeatIntervalSeconds: 24 * 60 * 60, // 24 hours in seconds
|
|
DefaultCheckIntervalSeconds: 60 * 60, // 1 hour in seconds
|
|
}
|
|
mq := NewMaintenanceQueue(policy)
|
|
|
|
// Create a basic integration (this would normally be more complex)
|
|
integration := NewMaintenanceIntegration(mq, policy)
|
|
mq.SetIntegration(integration)
|
|
|
|
task := &MaintenanceTask{
|
|
ID: "test-task-3",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusPending,
|
|
}
|
|
|
|
// With our current implementation (fallback logic), this should return true
|
|
result := mq.canScheduleTaskNow(task)
|
|
if !result {
|
|
t.Errorf("Expected canScheduleTaskNow to return true with fallback logic, got false")
|
|
}
|
|
}
|
|
|
|
func TestGetRunningTaskCount(t *testing.T) {
|
|
// Test the helper function used by fallback logic
|
|
mq := &MaintenanceQueue{
|
|
tasks: map[string]*MaintenanceTask{
|
|
"task1": {
|
|
ID: "task1",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusInProgress,
|
|
},
|
|
"task2": {
|
|
ID: "task2",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusAssigned,
|
|
},
|
|
"task3": {
|
|
ID: "task3",
|
|
Type: MaintenanceTaskType("vacuum"),
|
|
Status: TaskStatusInProgress,
|
|
},
|
|
"task4": {
|
|
ID: "task4",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusCompleted,
|
|
},
|
|
},
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
}
|
|
|
|
// Should count 2 running EC tasks (in_progress + assigned)
|
|
ecCount := mq.GetRunningTaskCount(MaintenanceTaskType("erasure_coding"))
|
|
if ecCount != 2 {
|
|
t.Errorf("Expected 2 running EC tasks, got %d", ecCount)
|
|
}
|
|
|
|
// Should count 1 running vacuum task
|
|
vacuumCount := mq.GetRunningTaskCount(MaintenanceTaskType("vacuum"))
|
|
if vacuumCount != 1 {
|
|
t.Errorf("Expected 1 running vacuum task, got %d", vacuumCount)
|
|
}
|
|
|
|
// Should count 0 running balance tasks
|
|
balanceCount := mq.GetRunningTaskCount(MaintenanceTaskType("balance"))
|
|
if balanceCount != 0 {
|
|
t.Errorf("Expected 0 running balance tasks, got %d", balanceCount)
|
|
}
|
|
}
|
|
|
|
func TestCanExecuteTaskType(t *testing.T) {
|
|
// Test the fallback logic helper function
|
|
mq := &MaintenanceQueue{
|
|
tasks: map[string]*MaintenanceTask{
|
|
"running-task": {
|
|
ID: "running-task",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Status: TaskStatusInProgress,
|
|
},
|
|
},
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil, // Will use default max concurrent = 1
|
|
integration: nil,
|
|
}
|
|
|
|
// Should return false for EC (1 running, max = 1)
|
|
result := mq.canExecuteTaskType(MaintenanceTaskType("erasure_coding"))
|
|
if result {
|
|
t.Errorf("Expected canExecuteTaskType to return false for EC at capacity, got true")
|
|
}
|
|
|
|
// Should return true for vacuum (0 running, max = 1)
|
|
result = mq.canExecuteTaskType(MaintenanceTaskType("vacuum"))
|
|
if !result {
|
|
t.Errorf("Expected canExecuteTaskType to return true for vacuum, got false")
|
|
}
|
|
}
|
|
|
|
func TestGetMaxConcurrentForTaskType_DefaultBehavior(t *testing.T) {
|
|
// Test the default behavior when no policy or integration is set
|
|
mq := &MaintenanceQueue{
|
|
tasks: make(map[string]*MaintenanceTask),
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil,
|
|
integration: nil,
|
|
}
|
|
|
|
// Should return default value of 1
|
|
maxConcurrent := mq.getMaxConcurrentForTaskType(MaintenanceTaskType("erasure_coding"))
|
|
if maxConcurrent != 1 {
|
|
t.Errorf("Expected default max concurrent to be 1, got %d", maxConcurrent)
|
|
}
|
|
|
|
maxConcurrent = mq.getMaxConcurrentForTaskType(MaintenanceTaskType("vacuum"))
|
|
if maxConcurrent != 1 {
|
|
t.Errorf("Expected default max concurrent to be 1, got %d", maxConcurrent)
|
|
}
|
|
}
|
|
|
|
// Test edge cases and error conditions
|
|
func TestCanScheduleTaskNow_NilTask(t *testing.T) {
|
|
mq := &MaintenanceQueue{
|
|
tasks: make(map[string]*MaintenanceTask),
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil,
|
|
integration: nil,
|
|
}
|
|
|
|
// This should panic with a nil task, so we expect and catch the panic
|
|
defer func() {
|
|
if r := recover(); r == nil {
|
|
t.Errorf("Expected canScheduleTaskNow to panic with nil task, but it didn't")
|
|
}
|
|
}()
|
|
|
|
// This should panic
|
|
mq.canScheduleTaskNow(nil)
|
|
}
|
|
|
|
func TestCanScheduleTaskNow_EmptyTaskType(t *testing.T) {
|
|
mq := &MaintenanceQueue{
|
|
tasks: make(map[string]*MaintenanceTask),
|
|
pendingTasks: []*MaintenanceTask{},
|
|
workers: make(map[string]*MaintenanceWorker),
|
|
policy: nil,
|
|
integration: nil,
|
|
}
|
|
|
|
task := &MaintenanceTask{
|
|
ID: "empty-type-task",
|
|
Type: MaintenanceTaskType(""), // Empty task type
|
|
Status: TaskStatusPending,
|
|
}
|
|
|
|
// Should handle empty task type gracefully
|
|
result := mq.canScheduleTaskNow(task)
|
|
if !result {
|
|
t.Errorf("Expected canScheduleTaskNow to handle empty task type, got false")
|
|
}
|
|
}
|
|
|
|
// TestCanScheduleTaskNow_WithPolicy verifies that an explicit MaintenancePolicy
// overrides the default concurrency limit of 1: with MaxConcurrent=3 for
// erasure_coding, scheduling succeeds while 2 tasks are running and fails once
// a third running task reaches the limit.
func TestCanScheduleTaskNow_WithPolicy(t *testing.T) {
	// Test with a policy that allows higher concurrency
	policy := &MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			string(MaintenanceTaskType("erasure_coding")): {
				Enabled:               true,
				MaxConcurrent:         3,
				RepeatIntervalSeconds: 60 * 60, // 1 hour
				CheckIntervalSeconds:  60 * 60, // 1 hour
			},
			string(MaintenanceTaskType("vacuum")): {
				Enabled:               true,
				MaxConcurrent:         2,
				RepeatIntervalSeconds: 60 * 60, // 1 hour
				CheckIntervalSeconds:  60 * 60, // 1 hour
			},
		},
		GlobalMaxConcurrent:          10,
		DefaultRepeatIntervalSeconds: 24 * 60 * 60, // 24 hours in seconds
		DefaultCheckIntervalSeconds:  60 * 60,      // 1 hour in seconds
	}

	// Queue starts with two "running" EC tasks; both in_progress and assigned
	// count toward the concurrency limit (see TestGetRunningTaskCount).
	mq := &MaintenanceQueue{
		tasks: map[string]*MaintenanceTask{
			"running-task-1": {
				ID:     "running-task-1",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusInProgress,
			},
			"running-task-2": {
				ID:     "running-task-2",
				Type:   MaintenanceTaskType("erasure_coding"),
				Status: TaskStatusAssigned,
			},
		},
		pendingTasks: []*MaintenanceTask{},
		workers:      make(map[string]*MaintenanceWorker),
		policy:       policy,
		integration:  nil,
	}

	task := &MaintenanceTask{
		ID:     "test-task-policy",
		Type:   MaintenanceTaskType("erasure_coding"),
		Status: TaskStatusPending,
	}

	// Should return true because we have 2 running EC tasks but max is 3
	result := mq.canScheduleTaskNow(task)
	if !result {
		t.Errorf("Expected canScheduleTaskNow to return true with policy allowing 3 concurrent, got false")
	}

	// Add one more running task to reach the limit
	mq.tasks["running-task-3"] = &MaintenanceTask{
		ID:     "running-task-3",
		Type:   MaintenanceTaskType("erasure_coding"),
		Status: TaskStatusInProgress,
	}

	// Should return false because we now have 3 running EC tasks (at limit)
	result = mq.canScheduleTaskNow(task)
	if result {
		t.Errorf("Expected canScheduleTaskNow to return false when at policy limit, got true")
	}
}
|
|
|
|
// TestMaintenanceQueue_TaskIDPreservation verifies that caller-supplied task
// IDs survive both ingestion paths — AddTasksFromResults (scanner detection
// results) and AddTask (manually constructed tasks) — without the queue
// regenerating them.
func TestMaintenanceQueue_TaskIDPreservation(t *testing.T) {
	// Setup Policy
	policy := &MaintenancePolicy{
		TaskPolicies:        make(map[string]*worker_pb.TaskPolicy),
		GlobalMaxConcurrent: 10,
	}

	// Setup Queue and Integration
	mq := NewMaintenanceQueue(policy)
	// We handle the integration manually to avoid complex setup
	// integration := NewMaintenanceIntegration(mq, policy)
	// mq.SetIntegration(integration)

	// 2. Verify ID Preservation in AddTasksFromResults
	originalID := "ec_task_123"
	results := []*TaskDetectionResult{
		{
			TaskID:      originalID,
			TaskType:    MaintenanceTaskType("erasure_coding"),
			VolumeID:    100,
			Server:      "server1",
			Priority:    PriorityNormal,
			TypedParams: &worker_pb.TaskParams{},
		},
	}

	mq.AddTasksFromResults(results)

	// Verify task exists with correct ID — the detection-result ID must be the
	// key under which the task is stored.
	queuedTask, exists := mq.tasks[originalID]
	if !exists {
		t.Errorf("Task with original ID %s not found in queue", originalID)
	} else {
		if queuedTask.ID != originalID {
			t.Errorf("Task ID mismatch: expected %s, got %s", originalID, queuedTask.ID)
		}
	}

	// 3. Verify AddTask preserves ID
	manualTask := &MaintenanceTask{
		ID:     "manual_id_456",
		Type:   MaintenanceTaskType("vacuum"),
		Status: TaskStatusPending,
	}
	mq.AddTask(manualTask)

	if manualTask.ID != "manual_id_456" {
		t.Errorf("AddTask overwrote ID: expected manual_id_456, got %s", manualTask.ID)
	}
}
|
|
|
|
// TestMaintenanceQueue_ActiveTopologySync verifies that the MaintenanceQueue
// keeps the ActiveTopology's capacity accounting in sync across the task
// lifecycle: AddPendingTask reserves a destination slot (10 -> 9), assignment
// via GetNextTask keeps the reservation (still 9), and CompleteTask releases
// it (back to 10).
func TestMaintenanceQueue_ActiveTopologySync(t *testing.T) {
	// Setup Policy
	policy := &MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance": {MaxConcurrent: 1},
		},
		GlobalMaxConcurrent: 10,
	}

	// Setup Queue and Integration
	mq := NewMaintenanceQueue(policy)
	integration := NewMaintenanceIntegration(mq, policy)
	mq.SetIntegration(integration)

	// 4. Verify ActiveTopology Synchronization (Assign and Complete)
	// Get and Setup Topology
	at := integration.GetActiveTopology()
	if at == nil {
		t.Fatalf("ActiveTopology not found in integration")
	}

	// Single server: disk 1 ("hdd") holds volume 100; disk 2 ("hdd2") is the
	// empty destination with 10 free slots.
	topologyInfo := &master_pb.TopologyInfo{
		DataCenterInfos: []*master_pb.DataCenterInfo{
			{
				Id: "dc1",
				RackInfos: []*master_pb.RackInfo{
					{
						Id: "rack1",
						DataNodeInfos: []*master_pb.DataNodeInfo{
							{
								Id: "server1",
								DiskInfos: map[string]*master_pb.DiskInfo{
									"hdd": {
										DiskId:         1,
										VolumeCount:    1,
										MaxVolumeCount: 10,
										VolumeInfos: []*master_pb.VolumeInformationMessage{
											{Id: 100, Collection: "col1"},
										},
									},
									"hdd2": {
										DiskId:         2,
										VolumeCount:    0,
										MaxVolumeCount: 10,
									},
								},
							},
						},
					},
				},
			},
		},
	}
	at.UpdateTopology(topologyInfo)

	// Add pending task to ActiveTopology
	taskID := "sync_test_123"
	err := at.AddPendingTask(topology.TaskSpec{
		TaskID:     taskID,
		TaskType:   topology.TaskTypeBalance,
		VolumeID:   100,
		VolumeSize: 1024 * 1024,
		Sources: []topology.TaskSourceSpec{
			{ServerID: "server1", DiskID: 1},
		},
		Destinations: []topology.TaskDestinationSpec{
			{ServerID: "server1", DiskID: 2},
		},
	})
	if err != nil {
		t.Fatalf("Failed to add pending task to ActiveTopology: %v", err)
	}

	// Add the same task to MaintenanceQueue (same ID so the queue and the
	// topology refer to the same unit of work).
	mq.AddTask(&MaintenanceTask{
		ID:         taskID,
		Type:       MaintenanceTaskType("balance"),
		VolumeID:   100,
		Server:     "server1",
		Collection: "col1",
		TypedParams: &worker_pb.TaskParams{
			TaskId: taskID,
			Targets: []*worker_pb.TaskTarget{
				{Node: "server1", DiskId: 2},
			},
		},
	})

	// Check initial available capacity on destination disk (server1:2)
	// server1:2 has MaxVolumeCount=10, VolumeCount=0.
	// Capacity should be 9 because AddPendingTask already reserved 1 slot.
	capacityBefore := at.GetEffectiveAvailableCapacity("server1", 2)
	if capacityBefore != 9 {
		t.Errorf("Expected capacity 9 after AddPendingTask, got %d", capacityBefore)
	}

	// 5. Verify AssignTask (via GetNextTask)
	mq.workers["worker1"] = &MaintenanceWorker{
		ID:            "worker1",
		Status:        "active",
		Capabilities:  []MaintenanceTaskType{"balance"},
		MaxConcurrent: 10,
	}

	taskFound := mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"})
	if taskFound == nil || taskFound.ID != taskID {
		t.Fatalf("Expected to get task %s, got %+v", taskID, taskFound)
	}

	// Capacity should still be 9 on destination disk (server1:2) — assignment
	// must not double-reserve.
	capacityAfterAssign := at.GetEffectiveAvailableCapacity("server1", 2)
	if capacityAfterAssign != 9 {
		t.Errorf("Capacity should still be 9 after assignment, got %d", capacityAfterAssign)
	}

	// 6. Verify CompleteTask (empty error string = success)
	mq.CompleteTask(taskID, "")

	// Capacity should be released back to 10
	capacityAfterComplete := at.GetEffectiveAvailableCapacity("server1", 2)
	if capacityAfterComplete != 10 {
		t.Errorf("Capacity should have returned to 10 after completion, got %d", capacityAfterComplete)
	}
}
|
|
|
|
func TestMaintenanceQueue_StaleWorkerCapacityRelease(t *testing.T) {
|
|
// Setup
|
|
policy := &MaintenancePolicy{
|
|
TaskPolicies: map[string]*worker_pb.TaskPolicy{
|
|
"balance": {MaxConcurrent: 1},
|
|
},
|
|
}
|
|
mq := NewMaintenanceQueue(policy)
|
|
integration := NewMaintenanceIntegration(mq, policy)
|
|
mq.SetIntegration(integration)
|
|
at := integration.GetActiveTopology()
|
|
|
|
topologyInfo := &master_pb.TopologyInfo{
|
|
DataCenterInfos: []*master_pb.DataCenterInfo{
|
|
{
|
|
Id: "dc1",
|
|
RackInfos: []*master_pb.RackInfo{
|
|
{
|
|
Id: "rack1",
|
|
DataNodeInfos: []*master_pb.DataNodeInfo{
|
|
{
|
|
Id: "server1",
|
|
DiskInfos: map[string]*master_pb.DiskInfo{
|
|
"hdd1": {DiskId: 1, VolumeCount: 1, MaxVolumeCount: 10},
|
|
"hdd2": {DiskId: 2, VolumeCount: 0, MaxVolumeCount: 10},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
at.UpdateTopology(topologyInfo)
|
|
|
|
taskID := "stale_test_123"
|
|
at.AddPendingTask(topology.TaskSpec{
|
|
TaskID: taskID,
|
|
TaskType: topology.TaskTypeBalance,
|
|
VolumeID: 100,
|
|
VolumeSize: 1024,
|
|
Sources: []topology.TaskSourceSpec{{ServerID: "server1", DiskID: 1}},
|
|
Destinations: []topology.TaskDestinationSpec{{ServerID: "server1", DiskID: 2}},
|
|
})
|
|
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: taskID,
|
|
Type: "balance",
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
TypedParams: &worker_pb.TaskParams{
|
|
TaskId: taskID,
|
|
Targets: []*worker_pb.TaskTarget{{Node: "server1", DiskId: 2}},
|
|
},
|
|
})
|
|
|
|
mq.workers["worker1"] = &MaintenanceWorker{
|
|
ID: "worker1",
|
|
Status: "active",
|
|
Capabilities: []MaintenanceTaskType{"balance"},
|
|
MaxConcurrent: 1,
|
|
LastHeartbeat: time.Now(),
|
|
}
|
|
|
|
// Assign task
|
|
mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"})
|
|
|
|
// Verify capacity reserved (9 left)
|
|
if at.GetEffectiveAvailableCapacity("server1", 2) != 9 {
|
|
t.Errorf("Expected capacity 9, got %d", at.GetEffectiveAvailableCapacity("server1", 2))
|
|
}
|
|
|
|
// Make worker stale
|
|
mq.workers["worker1"].LastHeartbeat = time.Now().Add(-1 * time.Hour)
|
|
|
|
// Remove stale workers
|
|
mq.RemoveStaleWorkers(10 * time.Minute)
|
|
|
|
// Verify capacity released (back to 10)
|
|
if at.GetEffectiveAvailableCapacity("server1", 2) != 10 {
|
|
t.Errorf("Expected capacity 10 after removing stale worker, got %d", at.GetEffectiveAvailableCapacity("server1", 2))
|
|
}
|
|
}
|
|
|
|
func TestMaintenanceManager_CancelTaskCapacityRelease(t *testing.T) {
|
|
// Setup Manager
|
|
config := DefaultMaintenanceConfig()
|
|
mm := NewMaintenanceManager(nil, config)
|
|
integration := mm.scanner.integration
|
|
mq := mm.queue
|
|
at := integration.GetActiveTopology()
|
|
|
|
topologyInfo := &master_pb.TopologyInfo{
|
|
DataCenterInfos: []*master_pb.DataCenterInfo{
|
|
{
|
|
Id: "dc1",
|
|
RackInfos: []*master_pb.RackInfo{
|
|
{
|
|
Id: "rack1",
|
|
DataNodeInfos: []*master_pb.DataNodeInfo{
|
|
{
|
|
Id: "server1",
|
|
DiskInfos: map[string]*master_pb.DiskInfo{
|
|
"hdd1": {DiskId: 1, VolumeCount: 1, MaxVolumeCount: 10},
|
|
"hdd2": {DiskId: 2, VolumeCount: 0, MaxVolumeCount: 10},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
at.UpdateTopology(topologyInfo)
|
|
|
|
taskID := "cancel_test_123"
|
|
// Note: AddPendingTask reserves capacity
|
|
at.AddPendingTask(topology.TaskSpec{
|
|
TaskID: taskID,
|
|
TaskType: topology.TaskTypeBalance,
|
|
VolumeID: 100,
|
|
VolumeSize: 1024,
|
|
Sources: []topology.TaskSourceSpec{{ServerID: "server1", DiskID: 1}},
|
|
Destinations: []topology.TaskDestinationSpec{{ServerID: "server1", DiskID: 2}},
|
|
})
|
|
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: taskID,
|
|
Type: "balance",
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
TypedParams: &worker_pb.TaskParams{
|
|
TaskId: taskID,
|
|
Targets: []*worker_pb.TaskTarget{{Node: "server1", DiskId: 2}},
|
|
},
|
|
})
|
|
|
|
// Verify capacity reserved (9 left)
|
|
if at.GetEffectiveAvailableCapacity("server1", 2) != 9 {
|
|
t.Errorf("Expected capacity 9, got %d", at.GetEffectiveAvailableCapacity("server1", 2))
|
|
}
|
|
|
|
// Cancel task
|
|
err := mm.CancelTask(taskID)
|
|
if err != nil {
|
|
t.Fatalf("Failed to cancel task: %v", err)
|
|
}
|
|
|
|
// Verify capacity released (back to 10)
|
|
if at.GetEffectiveAvailableCapacity("server1", 2) != 10 {
|
|
t.Errorf("Expected capacity 10 after cancelling task, got %d", at.GetEffectiveAvailableCapacity("server1", 2))
|
|
}
|
|
}
|
|
|
|
// MockPersistence is a stub implementation of the queue's persistence
// interface for tests. It serves a canned task list from LoadAllTaskStates and
// treats every save/delete/cleanup operation as a successful no-op.
type MockPersistence struct {
	// tasks is returned verbatim by LoadAllTaskStates.
	tasks []*MaintenanceTask
}

// SaveTaskState is a no-op; nothing is persisted in tests.
func (m *MockPersistence) SaveTaskState(task *MaintenanceTask) error { return nil }

// LoadTaskState always reports no stored task (nil task, nil error).
func (m *MockPersistence) LoadTaskState(taskID string) (*MaintenanceTask, error) { return nil, nil }

// LoadAllTaskStates returns the canned task list unchanged.
func (m *MockPersistence) LoadAllTaskStates() ([]*MaintenanceTask, error) { return m.tasks, nil }

// DeleteTaskState is a no-op.
func (m *MockPersistence) DeleteTaskState(taskID string) error { return nil }

// DeleteAllTaskStates is a no-op.
func (m *MockPersistence) DeleteAllTaskStates() error { return nil }

// CleanupCompletedTasks is a no-op.
func (m *MockPersistence) CleanupCompletedTasks() error { return nil }

// SaveTaskPolicy is a no-op.
func (m *MockPersistence) SaveTaskPolicy(taskType string, policy *TaskPolicy) error { return nil }
|
|
|
|
func TestMaintenanceQueue_LoadTasksStartsEmpty(t *testing.T) {
|
|
// Setup
|
|
policy := &MaintenancePolicy{
|
|
TaskPolicies: map[string]*worker_pb.TaskPolicy{
|
|
"balance": {MaxConcurrent: 1},
|
|
},
|
|
}
|
|
mq := NewMaintenanceQueue(policy)
|
|
|
|
// Setup mock persistence with tasks — these should NOT be loaded
|
|
mockTask := &MaintenanceTask{
|
|
ID: "old_task_123",
|
|
Type: "balance",
|
|
Status: TaskStatusPending,
|
|
}
|
|
mq.SetPersistence(&MockPersistence{tasks: []*MaintenanceTask{mockTask}})
|
|
|
|
// LoadTasksFromPersistence should be a no-op — scanner will re-detect
|
|
err := mq.LoadTasksFromPersistence()
|
|
if err != nil {
|
|
t.Fatalf("LoadTasksFromPersistence failed: %v", err)
|
|
}
|
|
|
|
// Queue should be empty — tasks will be re-detected by scanner
|
|
stats := mq.GetStats()
|
|
if stats.TotalTasks != 0 {
|
|
t.Errorf("Expected 0 tasks after startup, got %d", stats.TotalTasks)
|
|
}
|
|
}
|
|
|
|
func TestMaintenanceQueue_RetryCapacitySync(t *testing.T) {
|
|
// Setup
|
|
policy := &MaintenancePolicy{
|
|
TaskPolicies: map[string]*worker_pb.TaskPolicy{
|
|
"balance": {MaxConcurrent: 1},
|
|
},
|
|
}
|
|
mq := NewMaintenanceQueue(policy)
|
|
integration := NewMaintenanceIntegration(mq, policy)
|
|
mq.SetIntegration(integration)
|
|
at := integration.GetActiveTopology()
|
|
|
|
topologyInfo := &master_pb.TopologyInfo{
|
|
DataCenterInfos: []*master_pb.DataCenterInfo{
|
|
{
|
|
Id: "dc1",
|
|
RackInfos: []*master_pb.RackInfo{
|
|
{
|
|
Id: "rack1",
|
|
DataNodeInfos: []*master_pb.DataNodeInfo{
|
|
{
|
|
Id: "server1",
|
|
DiskInfos: map[string]*master_pb.DiskInfo{
|
|
"hdd1": {DiskId: 1, VolumeCount: 1, MaxVolumeCount: 10},
|
|
"hdd2": {DiskId: 2, VolumeCount: 0, MaxVolumeCount: 10},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
at.UpdateTopology(topologyInfo)
|
|
|
|
taskID := "retry_test_123"
|
|
// 1. Add task
|
|
at.AddPendingTask(topology.TaskSpec{
|
|
TaskID: taskID,
|
|
TaskType: topology.TaskTypeBalance,
|
|
VolumeID: 100,
|
|
VolumeSize: 1024,
|
|
Sources: []topology.TaskSourceSpec{{ServerID: "server1", DiskID: 1}},
|
|
Destinations: []topology.TaskDestinationSpec{{ServerID: "server1", DiskID: 2}},
|
|
})
|
|
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: taskID,
|
|
Type: "balance",
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
MaxRetries: 3,
|
|
TypedParams: &worker_pb.TaskParams{
|
|
TaskId: taskID,
|
|
Sources: []*worker_pb.TaskSource{{Node: "server1", DiskId: 1}},
|
|
Targets: []*worker_pb.TaskTarget{{Node: "server1", DiskId: 2}},
|
|
},
|
|
})
|
|
|
|
mq.workers["worker1"] = &MaintenanceWorker{
|
|
ID: "worker1",
|
|
Status: "active",
|
|
Capabilities: []MaintenanceTaskType{"balance"},
|
|
MaxConcurrent: 1,
|
|
LastHeartbeat: time.Now(),
|
|
}
|
|
|
|
// 2. Assign task
|
|
mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"})
|
|
|
|
// Verify capacity reserved (9 left)
|
|
if at.GetEffectiveAvailableCapacity("server1", 2) != 9 {
|
|
t.Errorf("Initial assignment: Expected capacity 9, got %d", at.GetEffectiveAvailableCapacity("server1", 2))
|
|
}
|
|
|
|
// 3. Complete with error (trigger retry)
|
|
mq.CompleteTask(taskID, "simulated failure")
|
|
|
|
// 4. Verify state after failure
|
|
task := mq.tasks[taskID]
|
|
if task.Status != TaskStatusPending {
|
|
t.Errorf("Expected status pending for retry, got %v", task.Status)
|
|
}
|
|
if task.RetryCount != 1 {
|
|
t.Errorf("Expected retry count 1, got %d", task.RetryCount)
|
|
}
|
|
|
|
// 5. Verify capacity in ActiveTopology
|
|
// It should first release (back to 10) and then re-reserve (SyncTask) because it's pending again.
|
|
// So it should still be 9.
|
|
if at.GetEffectiveAvailableCapacity("server1", 2) != 9 {
|
|
t.Errorf("After retry sync: Expected capacity 9, got %d", at.GetEffectiveAvailableCapacity("server1", 2))
|
|
}
|
|
}
|
|
|
|
// TestMaintenanceQueue_AssignTaskRollback verifies that when
// ActiveTopology.AssignTask fails during GetNextTask (here because the task
// was never registered as pending in the topology), the queue rolls the task
// back to pending: no worker ID, empty assignment history, and still present
// in the pendingTasks slice.
func TestMaintenanceQueue_AssignTaskRollback(t *testing.T) {
	// Setup Policy
	policy := &MaintenancePolicy{
		TaskPolicies: map[string]*worker_pb.TaskPolicy{
			"balance": {MaxConcurrent: 1},
		},
		GlobalMaxConcurrent: 10,
	}

	// Setup Queue and Integration
	mq := NewMaintenanceQueue(policy)
	integration := NewMaintenanceIntegration(mq, policy)
	mq.SetIntegration(integration)

	// Get Topology
	at := integration.GetActiveTopology()
	topologyInfo := &master_pb.TopologyInfo{
		DataCenterInfos: []*master_pb.DataCenterInfo{
			{
				Id: "dc1",
				RackInfos: []*master_pb.RackInfo{
					{
						Id: "rack1",
						DataNodeInfos: []*master_pb.DataNodeInfo{
							{
								Id: "server1",
								DiskInfos: map[string]*master_pb.DiskInfo{
									"hdd": {
										DiskId:         1,
										VolumeCount:    1,
										MaxVolumeCount: 1, // Only 1 slot
										VolumeInfos: []*master_pb.VolumeInformationMessage{
											{Id: 100, Collection: "col1"},
										},
									},
									"hdd2": {
										DiskId:         2,
										VolumeCount:    0,
										MaxVolumeCount: 0, // NO CAPACITY for target
									},
								},
							},
						},
					},
				},
			},
		},
	}
	at.UpdateTopology(topologyInfo)

	taskID := "rollback_test_123"

	// 1. Add task to MaintenanceQueue ONLY
	// It's not in ActiveTopology, so AssignTask will fail with "pending task not found"
	mq.AddTask(&MaintenanceTask{
		ID:         taskID,
		Type:       MaintenanceTaskType("balance"),
		VolumeID:   100,
		Server:     "server1",
		Collection: "col1",
		TypedParams: &worker_pb.TaskParams{
			TaskId: taskID,
			Targets: []*worker_pb.TaskTarget{
				{Node: "server1", DiskId: 2},
			},
		},
	})

	// 2. Setup worker
	mq.workers["worker1"] = &MaintenanceWorker{
		ID:            "worker1",
		Status:        "active",
		Capabilities:  []MaintenanceTaskType{"balance"},
		MaxConcurrent: 10,
	}

	// 3. Try to get next task
	taskFound := mq.GetNextTask("worker1", []MaintenanceTaskType{"balance"})

	// 4. Verify GetNextTask returned nil due to ActiveTopology.AssignTask failure
	if taskFound != nil {
		t.Errorf("Expected GetNextTask to return nil, got task %s", taskFound.ID)
	}

	// 5. Verify the task in MaintenanceQueue is rolled back to pending
	mq.mutex.RLock()
	task, exists := mq.tasks[taskID]
	mq.mutex.RUnlock()

	if !exists {
		t.Fatalf("Task %s should still exist in MaintenanceQueue", taskID)
	}
	if task.Status != TaskStatusPending {
		t.Errorf("Expected task status %v, got %v", TaskStatusPending, task.Status)
	}
	if task.WorkerID != "" {
		t.Errorf("Expected task WorkerID to be empty, got %s", task.WorkerID)
	}
	if len(task.AssignmentHistory) != 0 {
		t.Errorf("Expected assignment history to be empty, got %d records", len(task.AssignmentHistory))
	}

	// 6. Verify the task is still in pendingTasks slice
	mq.mutex.RLock()
	foundInPending := false
	for _, pt := range mq.pendingTasks {
		if pt.ID == taskID {
			foundInPending = true
			break
		}
	}
	mq.mutex.RUnlock()

	if !foundInPending {
		t.Errorf("Task %s should still be in pendingTasks slice", taskID)
	}
}
|
|
|
|
func TestGetNextTask_SkipsVolumeConflictsAcrossTypes(t *testing.T) {
|
|
policy := &MaintenancePolicy{
|
|
TaskPolicies: map[string]*worker_pb.TaskPolicy{
|
|
"balance": {MaxConcurrent: 2},
|
|
"erasure_coding": {MaxConcurrent: 2},
|
|
"vacuum": {MaxConcurrent: 2},
|
|
},
|
|
}
|
|
|
|
mq := NewMaintenanceQueue(policy)
|
|
|
|
now := time.Now()
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: "t1",
|
|
Type: MaintenanceTaskType("balance"),
|
|
Priority: PriorityHigh,
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
ScheduledAt: now.Add(-3 * time.Second),
|
|
})
|
|
t2 := &MaintenanceTask{
|
|
ID: "t2",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
Priority: PriorityNormal,
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
Status: TaskStatusPending,
|
|
ScheduledAt: now.Add(-2 * time.Second),
|
|
}
|
|
mq.mutex.Lock()
|
|
mq.tasks[t2.ID] = t2
|
|
mq.pendingTasks = append(mq.pendingTasks, t2)
|
|
mq.mutex.Unlock()
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: "t3",
|
|
Type: MaintenanceTaskType("vacuum"),
|
|
Priority: PriorityNormal,
|
|
VolumeID: 200,
|
|
Server: "server1",
|
|
ScheduledAt: now.Add(-1 * time.Second),
|
|
})
|
|
|
|
mq.workers["worker1"] = &MaintenanceWorker{
|
|
ID: "worker1",
|
|
Status: "active",
|
|
Capabilities: []MaintenanceTaskType{"balance", "erasure_coding", "vacuum"},
|
|
MaxConcurrent: 2,
|
|
LastHeartbeat: time.Now(),
|
|
}
|
|
mq.workers["worker2"] = &MaintenanceWorker{
|
|
ID: "worker2",
|
|
Status: "active",
|
|
Capabilities: []MaintenanceTaskType{"balance", "erasure_coding", "vacuum"},
|
|
MaxConcurrent: 2,
|
|
LastHeartbeat: time.Now(),
|
|
}
|
|
|
|
task1 := mq.GetNextTask("worker1", mq.workers["worker1"].Capabilities)
|
|
if task1 == nil || task1.ID != "t1" {
|
|
t.Fatalf("Expected first assignment to be t1, got %+v", task1)
|
|
}
|
|
|
|
task2 := mq.GetNextTask("worker2", mq.workers["worker2"].Capabilities)
|
|
if task2 == nil {
|
|
t.Fatalf("Expected a second task to be assigned, got nil")
|
|
}
|
|
if task2.ID != "t3" {
|
|
t.Fatalf("Expected second assignment to skip volume 100 and pick t3, got %s", task2.ID)
|
|
}
|
|
|
|
if mq.tasks["t2"].Status != TaskStatusPending {
|
|
t.Fatalf("Expected t2 to remain pending due to volume conflict, got %s", mq.tasks["t2"].Status)
|
|
}
|
|
}
|
|
|
|
func TestAddTask_OnePendingTaskPerVolume(t *testing.T) {
|
|
mq := NewMaintenanceQueue(&MaintenancePolicy{
|
|
TaskPolicies: map[string]*worker_pb.TaskPolicy{
|
|
"balance": {MaxConcurrent: 1},
|
|
"erasure_coding": {MaxConcurrent: 1},
|
|
},
|
|
})
|
|
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: "t1",
|
|
Type: MaintenanceTaskType("balance"),
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
})
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: "t2",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
})
|
|
|
|
mq.mutex.RLock()
|
|
defer mq.mutex.RUnlock()
|
|
|
|
if len(mq.tasks) != 1 {
|
|
t.Fatalf("Expected 1 task in queue, got %d", len(mq.tasks))
|
|
}
|
|
if len(mq.pendingTasks) != 1 {
|
|
t.Fatalf("Expected 1 pending task, got %d", len(mq.pendingTasks))
|
|
}
|
|
if _, exists := mq.tasks["t1"]; !exists {
|
|
t.Fatalf("Expected task t1 to be queued")
|
|
}
|
|
if _, exists := mq.tasks["t2"]; exists {
|
|
t.Fatalf("Did not expect task t2 to be queued due to pending volume")
|
|
}
|
|
}
|
|
|
|
func TestAddTask_RejectsWhenVolumeHasRunningTask(t *testing.T) {
|
|
mq := NewMaintenanceQueue(&MaintenancePolicy{
|
|
TaskPolicies: map[string]*worker_pb.TaskPolicy{
|
|
"balance": {MaxConcurrent: 1},
|
|
"erasure_coding": {MaxConcurrent: 1},
|
|
},
|
|
})
|
|
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: "t1",
|
|
Type: MaintenanceTaskType("balance"),
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
})
|
|
|
|
// Simulate assignment to make it active
|
|
mq.mutex.Lock()
|
|
mq.tasks["t1"].Status = TaskStatusInProgress
|
|
mq.mutex.Unlock()
|
|
|
|
mq.AddTask(&MaintenanceTask{
|
|
ID: "t2",
|
|
Type: MaintenanceTaskType("erasure_coding"),
|
|
VolumeID: 100,
|
|
Server: "server1",
|
|
})
|
|
|
|
mq.mutex.RLock()
|
|
defer mq.mutex.RUnlock()
|
|
|
|
if len(mq.tasks) != 1 {
|
|
t.Fatalf("Expected 1 task in queue, got %d", len(mq.tasks))
|
|
}
|
|
if _, exists := mq.tasks["t2"]; exists {
|
|
t.Fatalf("Did not expect task t2 to be queued due to active volume task")
|
|
}
|
|
}
|