Fix maintenance worker panic and add EC integration tests (#8068)

* Fix nil pointer panic in maintenance worker when receiving empty task assignment

When a worker requests a task and none are available, the admin server sends an empty TaskAssignment message. The worker was attempting to log the task details without checking whether the TaskId was empty, causing a nil pointer dereference when accessing taskAssign.Params.VolumeId.

This fix adds a check for an empty TaskId before processing the assignment, preventing worker crashes and improving stability in production environments.

* Add EC integration test for admin-worker maintenance system

Adds a comprehensive integration test that verifies the end-to-end flow of erasure coding maintenance tasks:
- Admin server detects volumes needing EC encoding
- Workers register and receive task assignments
- EC encoding is executed and verified in master topology
- File read-back validation confirms data integrity

The test uses unique absolute working directories for each worker to prevent ID conflicts and ensure stable worker registration. Includes proper cleanup and process management for reliable test execution.

* Improve maintenance system stability and task deduplication

- Add cross-type task deduplication to prevent concurrent maintenance operations on the same volume (EC, balance, vacuum)
- Implement HasAnyTask check in ActiveTopology for better coordination
- Increase RequestTask timeout from 5s to 30s to prevent unnecessary worker reconnections
- Add TaskTypeNone sentinel for generic task checks
- Update all task detectors to use HasAnyTask for conflict prevention
- Improve config persistence and schema handling

* Add GitHub Actions workflow for EC integration tests

Adds a CI workflow that runs EC integration tests on push and pull requests to the master branch.
The workflow:
- Triggers on changes to admin, worker, or test files
- Builds the weed binary
- Runs the EC integration test suite
- Uploads test logs as artifacts on failure for debugging

This ensures the maintenance system remains stable and worker-admin integration is validated in CI.

* go version 1.24
* Address review comments
* Update maintenance_integration.go
* support seconds
* Prioritize EC over balancing in tests
2026-01-20 15:07:43 -08:00
parent f5bea40ab4
commit 13dcf445a4
23 changed files with 831 additions and 60 deletions
--- a/weed/admin/topology/capacity.go
+++ b/weed/admin/topology/capacity.go
@@ -3,6 +3,7 @@ package topology
 import (
 	"fmt"

+	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
 )

@@ -83,6 +84,7 @@ func (at *ActiveTopology) GetDisksWithEffectiveCapacity(taskType TaskType, exclu

 	var available []*DiskInfo

+	glog.V(2).Infof("GetDisksWithEffectiveCapacity checking %d disks for type %s, minCapacity %d", len(at.disks), taskType, minCapacity)
 	for _, disk := range at.disks {
 		if disk.NodeID == excludeNodeID {
 			continue // Skip excluded node
@@ -115,11 +117,24 @@ func (at *ActiveTopology) GetDisksWithEffectiveCapacity(taskType TaskType, exclu
 					FreeVolumeCount:   disk.DiskInfo.DiskInfo.FreeVolumeCount,
 				}
 				diskCopy.DiskInfo = diskInfoCopy
+				diskCopy.DiskInfo.MaxVolumeCount = disk.DiskInfo.DiskInfo.MaxVolumeCount // Ensure Max is set

 				available = append(available, &diskCopy)
+			} else {
+				glog.V(2).Infof("Disk %s:%d capacity %d < %d (Max:%d, Vol:%d)", disk.NodeID, disk.DiskInfo.DiskID, effectiveCapacity.VolumeSlots, minCapacity, disk.DiskInfo.DiskInfo.MaxVolumeCount, disk.DiskInfo.DiskInfo.VolumeCount)
 			}
+		} else {
+			tasksInfo := ""
+			for _, t := range disk.pendingTasks {
+				tasksInfo += fmt.Sprintf("[P:%s,Vol:%d] ", t.TaskType, t.VolumeID)
+			}
+			for _, t := range disk.assignedTasks {
+				tasksInfo += fmt.Sprintf("[A:%s,Vol:%d] ", t.TaskType, t.VolumeID)
+			}
+			glog.V(2).Infof("Disk %s:%d unavailable. Load: %d, MaxLoad: %d. Tasks: %s", disk.NodeID, disk.DiskInfo.DiskID, len(disk.pendingTasks)+len(disk.assignedTasks), MaxConcurrentTasksPerDisk, tasksInfo)
 		}
 	}
+	glog.V(2).Infof("GetDisksWithEffectiveCapacity found %d available disks", len(available))

 	return available
 }
--- a/weed/admin/topology/task_management.go
+++ b/weed/admin/topology/task_management.go
@@ -195,12 +195,67 @@ func (at *ActiveTopology) AddPendingTask(spec TaskSpec) error {
 	at.pendingTasks[spec.TaskID] = task
 	at.assignTaskToDisk(task)

-	glog.V(2).Infof("Added pending %s task %s: volume %d, %d sources, %d destinations",
-		spec.TaskType, spec.TaskID, spec.VolumeID, len(sources), len(destinations))
+	return nil
+}
+
+// RestoreMaintenanceTask restores a task from persistent storage into the active topology
+func (at *ActiveTopology) RestoreMaintenanceTask(taskID string, volumeID uint32, taskType TaskType, status TaskStatus, sources []TaskSource, destinations []TaskDestination, estimatedSize int64) error {
+	at.mutex.Lock()
+	defer at.mutex.Unlock()
+
+	task := &taskState{
+		VolumeID:      volumeID,
+		TaskType:      taskType,
+		Status:        status,
+		StartedAt:     time.Now(), // Fallback if not provided, will be updated by heartbeats
+		EstimatedSize: estimatedSize,
+		Sources:       sources,
+		Destinations:  destinations,
+	}
+
+	if status == TaskStatusInProgress {
+		at.assignedTasks[taskID] = task
+	} else if status == TaskStatusPending {
+		at.pendingTasks[taskID] = task
+	} else {
+		return nil // Ignore other statuses for topology tracking
+	}
+
+	// Re-register task with disks for capacity tracking
+	at.assignTaskToDisk(task)
+
+	glog.V(1).Infof("Restored %s task %s in topology: volume %d, %d sources, %d destinations",
+		taskType, taskID, volumeID, len(sources), len(destinations))

 	return nil
 }

+// HasTask checks if there is any pending or assigned task for the given volume and task type.
+// If taskType is TaskTypeNone, it checks for ANY task type.
+func (at *ActiveTopology) HasTask(volumeID uint32, taskType TaskType) bool {
+	at.mutex.RLock()
+	defer at.mutex.RUnlock()
+
+	for _, task := range at.pendingTasks {
+		if task.VolumeID == volumeID && (taskType == TaskTypeNone || task.TaskType == taskType) {
+			return true
+		}
+	}
+
+	for _, task := range at.assignedTasks {
+		if task.VolumeID == volumeID && (taskType == TaskTypeNone || task.TaskType == taskType) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// HasAnyTask checks if there is any pending or assigned task for the given volume across all types.
+func (at *ActiveTopology) HasAnyTask(volumeID uint32) bool {
+	return at.HasTask(volumeID, TaskTypeNone)
+}
+
 // calculateSourceStorageImpact calculates storage impact for sources based on task type and cleanup type
 func (at *ActiveTopology) calculateSourceStorageImpact(taskType TaskType, cleanupType SourceCleanupType, volumeSize int64) StorageSlotChange {
 	switch taskType {
--- a/weed/admin/topology/types.go
+++ b/weed/admin/topology/types.go
@@ -10,6 +10,7 @@ type TaskStatus string

 // Common task type constants
 const (
+	TaskTypeNone          TaskType = ""
 	TaskTypeVacuum        TaskType = "vacuum"
 	TaskTypeBalance       TaskType = "balance"
 	TaskTypeErasureCoding TaskType = "erasure_coding"
@@ -27,11 +28,11 @@ const (
 const (
 	// MaxConcurrentTasksPerDisk defines the maximum number of concurrent tasks per disk
 	// This prevents overloading a single disk with too many simultaneous operations
-	MaxConcurrentTasksPerDisk = 2
+	MaxConcurrentTasksPerDisk = 10

 	// MaxTotalTaskLoadPerDisk defines the maximum total task load (pending + active) per disk
 	// This allows more tasks to be queued but limits the total pipeline depth
-	MaxTotalTaskLoadPerDisk = 3
+	MaxTotalTaskLoadPerDisk = 20

 	// MaxTaskLoadForECPlacement defines the maximum task load to consider a disk for EC placement
 	// This threshold ensures disks aren't overloaded when planning EC operations