seaweedFS/weed/admin/maintenance/maintenance_manager.go
Chris Lu 13dcf445a4 Fix maintenance worker panic and add EC integration tests (#8068)
* Fix nil pointer panic in maintenance worker when receiving empty task assignment

When a worker requests a task and none are available, the admin server
sends an empty TaskAssignment message. The worker was attempting to log
the task details without checking if the TaskId was empty, causing a
nil pointer dereference when accessing taskAssign.Params.VolumeId.

This fix adds a check for empty TaskId before processing the assignment,
preventing worker crashes and improving stability in production environments.
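
A minimal sketch of the guard described above. The field names (TaskId, Params.VolumeId) follow the commit text; handleTaskAssignment itself is a hypothetical stand-in for the worker's real assignment handling, and the worker's existing glog and worker_pb imports are assumed:

    // Sketch only: skip empty assignments before touching taskAssign.Params.
    // handleTaskAssignment is an illustrative helper, not the worker's actual function.
    func handleTaskAssignment(taskAssign *worker_pb.TaskAssignment) {
        if taskAssign == nil || taskAssign.TaskId == "" {
            // The admin server had no work available; wait for the next poll.
            glog.V(3).Infof("no task assigned")
            return
        }
        // Safe to dereference Params now that the assignment is known to be populated.
        glog.Infof("received task %s for volume %d", taskAssign.TaskId, taskAssign.Params.VolumeId)
    }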

* Add EC integration test for admin-worker maintenance system

Adds a comprehensive integration test that verifies the end-to-end flow
of erasure coding maintenance tasks:
- Admin server detects volumes needing EC encoding
- Workers register and receive task assignments
- EC encoding is executed and verified in master topology
- File read-back validation confirms data integrity

The test uses unique absolute working directories for each worker to
prevent ID conflicts and ensure stable worker registration. Includes
proper cleanup and process management for reliable test execution.
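
A minimal sketch of the unique-working-directory idea; the helper name, package, and paths are illustrative assumptions, not the test's actual code:

    // Sketch only: give each test worker its own absolute working directory so
    // worker registration does not collide on derived IDs. Assumes the standard
    // fmt, os, path/filepath, and testing imports.
    func workerDir(t *testing.T, baseDir string, workerIndex int) string {
        t.Helper()
        dir, err := filepath.Abs(filepath.Join(baseDir, fmt.Sprintf("worker-%d", workerIndex)))
        if err != nil {
            t.Fatalf("resolve worker dir: %v", err)
        }
        if err := os.MkdirAll(dir, 0o755); err != nil {
            t.Fatalf("create worker dir: %v", err)
        }
        return dir
    }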

* Improve maintenance system stability and task deduplication

- Add cross-type task deduplication to prevent concurrent maintenance
  operations on the same volume (EC, balance, vacuum); see the sketch
  after this list
- Implement HasAnyTask check in ActiveTopology for better coordination
- Increase RequestTask timeout from 5s to 30s to prevent unnecessary
  worker reconnections
- Add TaskTypeNone sentinel for generic task checks
- Update all task detectors to use HasAnyTask for conflict prevention
- Improve config persistence and schema handling
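
A minimal sketch of the cross-type deduplication check. The interface below is an assumption standing in for ActiveTopology, and the real HasAnyTask signature may differ:

    // Sketch only: before a detector proposes a new task, check whether any
    // maintenance task already targets the volume. taskTopology and the
    // HasAnyTask signature are illustrative assumptions.
    type taskTopology interface {
        HasAnyTask(volumeID uint32) bool
    }

    func shouldProposeTask(topo taskTopology, volumeID uint32) bool {
        if topo.HasAnyTask(volumeID) {
            // An EC, balance, or vacuum task already covers this volume;
            // skip it to avoid concurrent conflicting operations.
            return false
        }
        return true
    }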

* Add GitHub Actions workflow for EC integration tests

Adds a CI workflow that runs the EC integration tests on pushes and pull requests
to the master branch. The workflow:
- Triggers on changes to admin, worker, or test files
- Builds the weed binary
- Runs the EC integration test suite
- Uploads test logs as artifacts on failure for debugging

This ensures the maintenance system remains stable and worker-admin
integration is validated in CI.

* go version 1.24

* address comments

* Update maintenance_integration.go

* support seconds

* ec prioritize over balancing in tests
2026-01-20 15:07:43 -08:00


package maintenance

import (
    "fmt"
    "strings"
    "sync"
    "time"

    "github.com/seaweedfs/seaweedfs/weed/glog"
    "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
    "github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance"
    "github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
    "github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
)

// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy
func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy {
    policy := &worker_pb.MaintenancePolicy{
        GlobalMaxConcurrent: 4,
        DefaultRepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds
        DefaultCheckIntervalSeconds: 12 * 3600, // 12 hours in seconds
        TaskPolicies: make(map[string]*worker_pb.TaskPolicy),
    }

    // Load vacuum task configuration
    if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil {
        policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{
            Enabled: vacuumConfig.Enabled,
            MaxConcurrent: int32(vacuumConfig.MaxConcurrent),
            RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
            CheckIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
            TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
                VacuumConfig: &worker_pb.VacuumTaskConfig{
                    GarbageThreshold: float64(vacuumConfig.GarbageThreshold),
                    MinVolumeAgeHours: int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
                    MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds),
                },
            },
        }
    }

    // Load erasure coding task configuration
    if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil {
        policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{
            Enabled: ecConfig.Enabled,
            MaxConcurrent: int32(ecConfig.MaxConcurrent),
            RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
            CheckIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
            TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
                ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
                    FullnessRatio: float64(ecConfig.FullnessRatio),
                    QuietForSeconds: int32(ecConfig.QuietForSeconds),
                    MinVolumeSizeMb: int32(ecConfig.MinSizeMB),
                    CollectionFilter: ecConfig.CollectionFilter,
                },
            },
        }
    }

    // Load balance task configuration
    if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil {
        policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{
            Enabled: balanceConfig.Enabled,
            MaxConcurrent: int32(balanceConfig.MaxConcurrent),
            RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
            CheckIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
            TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
                BalanceConfig: &worker_pb.BalanceTaskConfig{
                    ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold),
                    MinServerCount: int32(balanceConfig.MinServerCount),
                },
            },
        }
    }

    glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies))
    return policy
}

// MaintenanceManager coordinates the maintenance system
type MaintenanceManager struct {
    config      *MaintenanceConfig
    scanner     *MaintenanceScanner
    queue       *MaintenanceQueue
    adminClient AdminClient
    running     bool
    stopChan    chan struct{}

    // Error handling and backoff
    errorCount     int
    lastError      error
    lastErrorTime  time.Time
    backoffDelay   time.Duration
    mutex          sync.RWMutex
    scanInProgress bool
}

// NewMaintenanceManager creates a new maintenance manager
func NewMaintenanceManager(adminClient AdminClient, config *MaintenanceConfig) *MaintenanceManager {
    if config == nil {
        config = DefaultMaintenanceConfig()
    }

    // Use the policy from the config (which is populated from separate task files in LoadMaintenanceConfig)
    policy := config.Policy
    if policy == nil {
        // Fallback: build policy from separate task configuration files if not already populated
        policy = buildPolicyFromTaskConfigs()
    }

    queue := NewMaintenanceQueue(policy)
    scanner := NewMaintenanceScanner(adminClient, policy, queue)

    return &MaintenanceManager{
        config: config,
        scanner: scanner,
        queue: queue,
        adminClient: adminClient,
        stopChan: make(chan struct{}),
        backoffDelay: time.Second, // Start with 1 second backoff
    }
}

// Start begins the maintenance manager
func (mm *MaintenanceManager) Start() error {
    if !mm.config.Enabled {
        glog.V(1).Infof("Maintenance system is disabled")
        return nil
    }

    // Validate configuration durations to prevent ticker panics
    if err := mm.validateConfig(); err != nil {
        return fmt.Errorf("invalid maintenance configuration: %w", err)
    }

    mm.running = true

    // Start background processes
    go mm.scanLoop()
    go mm.cleanupLoop()
    go mm.topologyStatusLoop() // Periodic diagnostic logging

    glog.Infof("Maintenance manager started with scan interval %ds", mm.config.ScanIntervalSeconds)
    return nil
}

// validateConfig validates the maintenance configuration durations
func (mm *MaintenanceManager) validateConfig() error {
    if mm.config.ScanIntervalSeconds <= 0 {
        glog.Warningf("Invalid scan interval %ds, using default 30m", mm.config.ScanIntervalSeconds)
        mm.config.ScanIntervalSeconds = 30 * 60 // 30 minutes in seconds
    }
    if mm.config.CleanupIntervalSeconds <= 0 {
        glog.Warningf("Invalid cleanup interval %ds, using default 24h", mm.config.CleanupIntervalSeconds)
        mm.config.CleanupIntervalSeconds = 24 * 60 * 60 // 24 hours in seconds
    }
    if mm.config.WorkerTimeoutSeconds <= 0 {
        glog.Warningf("Invalid worker timeout %ds, using default 5m", mm.config.WorkerTimeoutSeconds)
        mm.config.WorkerTimeoutSeconds = 5 * 60 // 5 minutes in seconds
    }
    if mm.config.TaskTimeoutSeconds <= 0 {
        glog.Warningf("Invalid task timeout %ds, using default 2h", mm.config.TaskTimeoutSeconds)
        mm.config.TaskTimeoutSeconds = 2 * 60 * 60 // 2 hours in seconds
    }
    if mm.config.RetryDelaySeconds <= 0 {
        glog.Warningf("Invalid retry delay %ds, using default 15m", mm.config.RetryDelaySeconds)
        mm.config.RetryDelaySeconds = 15 * 60 // 15 minutes in seconds
    }
    if mm.config.TaskRetentionSeconds <= 0 {
        glog.Warningf("Invalid task retention %ds, using default 168h", mm.config.TaskRetentionSeconds)
        mm.config.TaskRetentionSeconds = 7 * 24 * 60 * 60 // 7 days in seconds
    }
    return nil
}

// IsRunning returns whether the maintenance manager is currently running
func (mm *MaintenanceManager) IsRunning() bool {
    return mm.running
}

// Stop terminates the maintenance manager
func (mm *MaintenanceManager) Stop() {
    mm.running = false
    close(mm.stopChan)
    glog.Infof("Maintenance manager stopped")
}

// scanLoop periodically scans for maintenance tasks with adaptive timing
func (mm *MaintenanceManager) scanLoop() {
    scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
    ticker := time.NewTicker(scanInterval)
    defer ticker.Stop()

    for mm.running {
        select {
        case <-mm.stopChan:
            return
        case <-ticker.C:
            glog.V(1).Infof("Performing maintenance scan every %v", scanInterval)
            // Use the same synchronization as TriggerScan to prevent concurrent scans
            if err := mm.triggerScanInternal(false); err != nil {
                glog.V(1).Infof("Scheduled scan skipped: %v", err)
            }

            // Adjust ticker interval based on error state (read error state safely)
            currentInterval := mm.getScanInterval(scanInterval)

            // Reset ticker with new interval if needed
            if currentInterval != scanInterval {
                ticker.Stop()
                ticker = time.NewTicker(currentInterval)
            }
        }
    }
}

// getScanInterval safely reads the current scan interval with error backoff
func (mm *MaintenanceManager) getScanInterval(baseInterval time.Duration) time.Duration {
    mm.mutex.RLock()
    defer mm.mutex.RUnlock()

    if mm.errorCount > 0 {
        // Use backoff delay when there are errors
        currentInterval := mm.backoffDelay
        if currentInterval > baseInterval {
            // Don't make it longer than the configured interval * 10
            maxInterval := baseInterval * 10
            if currentInterval > maxInterval {
                currentInterval = maxInterval
            }
        }
        return currentInterval
    }
    return baseInterval
}

// cleanupLoop periodically cleans up old tasks and stale workers
func (mm *MaintenanceManager) cleanupLoop() {
    cleanupInterval := time.Duration(mm.config.CleanupIntervalSeconds) * time.Second
    ticker := time.NewTicker(cleanupInterval)
    defer ticker.Stop()

    for mm.running {
        select {
        case <-mm.stopChan:
            return
        case <-ticker.C:
            mm.performCleanup()
        }
    }
}

// topologyStatusLoop periodically logs topology status for diagnostics
func (mm *MaintenanceManager) topologyStatusLoop() {
    // Log topology status every 5 minutes for diagnostic purposes
    statusInterval := 5 * time.Minute
    ticker := time.NewTicker(statusInterval)
    defer ticker.Stop()

    for mm.running {
        select {
        case <-mm.stopChan:
            return
        case <-ticker.C:
            mm.logTopologyStatus()
        }
    }
}

// logTopologyStatus logs current topology and worker status for diagnostics
func (mm *MaintenanceManager) logTopologyStatus() {
    if mm.scanner == nil || mm.scanner.integration == nil {
        glog.V(2).Infof("Topology status: scanner/integration not available")
        return
    }

    activeTopology := mm.scanner.integration.GetActiveTopology()
    if activeTopology == nil {
        glog.V(1).Infof("Topology status: ActiveTopology is nil")
        return
    }

    diskCount := activeTopology.GetDiskCount()
    nodeCount := len(activeTopology.GetAllNodes())

    // Get queue stats
    stats := mm.queue.GetStats()
    workerCount := len(mm.queue.GetWorkers())

    mm.mutex.RLock()
    errorCount := mm.errorCount
    mm.mutex.RUnlock()

    glog.V(0).Infof("Topology status: %d nodes, %d disks, %d workers, %d pending tasks, %d running tasks, errors: %d",
        nodeCount, diskCount, workerCount,
        stats.TasksByStatus[TaskStatusPending],
        stats.TasksByStatus[TaskStatusInProgress]+stats.TasksByStatus[TaskStatusAssigned],
        errorCount)
}

// performScan executes a maintenance scan with error handling and backoff
func (mm *MaintenanceManager) performScan() {
    defer func() {
        // Always reset scan in progress flag when done
        mm.mutex.Lock()
        mm.scanInProgress = false
        mm.mutex.Unlock()
    }()

    glog.Infof("Starting maintenance scan...")
    results, err := mm.scanner.ScanForMaintenanceTasks()
    if err != nil {
        // Handle scan error
        mm.mutex.Lock()
        mm.handleScanError(err)
        mm.mutex.Unlock()
        glog.Warningf("Maintenance scan failed: %v", err)
        return
    }

    // Scan succeeded - update state and process results
    mm.handleScanSuccess(results)
}

// handleScanSuccess processes successful scan results with proper lock management
func (mm *MaintenanceManager) handleScanSuccess(results []*TaskDetectionResult) {
    // Update manager state first
    mm.mutex.Lock()
    mm.resetErrorTracking()
    taskCount := len(results)
    mm.mutex.Unlock()

    if taskCount > 0 {
        // Count tasks by type for logging (outside of lock)
        taskCounts := make(map[MaintenanceTaskType]int)
        for _, result := range results {
            taskCounts[result.TaskType]++
        }

        // Add tasks to queue (no manager lock held)
        mm.queue.AddTasksFromResults(results)

        // Log detailed scan results
        glog.Infof("Maintenance scan completed: found %d tasks", taskCount)
        for taskType, count := range taskCounts {
            glog.Infof(" - %s: %d tasks", taskType, count)
        }
    } else {
        glog.Infof("Maintenance scan completed: no maintenance tasks needed")
    }
}

// handleScanError handles scan errors with exponential backoff and reduced logging
func (mm *MaintenanceManager) handleScanError(err error) {
    now := time.Now()
    mm.errorCount++
    mm.lastError = err
    mm.lastErrorTime = now

    // Use exponential backoff, capped at 5 minutes
    if mm.errorCount > 1 {
        mm.backoffDelay = mm.backoffDelay * 2
        if mm.backoffDelay > 5*time.Minute {
            mm.backoffDelay = 5 * time.Minute // Cap at 5 minutes
        }
    }

    // Reduce log frequency based on error count and time
    shouldLog := false
    if mm.errorCount <= 3 {
        // Log first 3 errors immediately
        shouldLog = true
    } else if mm.errorCount <= 10 && mm.errorCount%3 == 0 {
        // Log every 3rd error for errors 4-10
        shouldLog = true
    } else if mm.errorCount%10 == 0 {
        // Log every 10th error after that
        shouldLog = true
    }

    if shouldLog {
        // Check if it's a connection error to provide better messaging
        if isConnectionError(err) {
            if mm.errorCount == 1 {
                glog.Errorf("Maintenance scan failed: %v (will retry with backoff)", err)
            } else {
                glog.Errorf("Maintenance scan still failing after %d attempts: %v (backoff: %v)",
                    mm.errorCount, err, mm.backoffDelay)
            }
        } else {
            glog.Errorf("Maintenance scan failed: %v", err)
        }
    } else {
        // Use debug level for suppressed errors
        glog.V(3).Infof("Maintenance scan failed (error #%d, suppressed): %v", mm.errorCount, err)
    }
}

// resetErrorTracking resets error tracking when scan succeeds
func (mm *MaintenanceManager) resetErrorTracking() {
    if mm.errorCount > 0 {
        glog.V(1).Infof("Maintenance scan recovered after %d failed attempts", mm.errorCount)
        mm.errorCount = 0
        mm.lastError = nil
        mm.backoffDelay = time.Second // Reset to initial delay
    }
}

// isConnectionError checks if the error is a connection-related error
func isConnectionError(err error) bool {
    if err == nil {
        return false
    }
    errStr := err.Error()
    return strings.Contains(errStr, "connection refused") ||
        strings.Contains(errStr, "connection error") ||
        strings.Contains(errStr, "dial tcp") ||
        strings.Contains(errStr, "connection timeout") ||
        strings.Contains(errStr, "no route to host") ||
        strings.Contains(errStr, "network unreachable")
}

// performCleanup cleans up old tasks and stale workers
func (mm *MaintenanceManager) performCleanup() {
    glog.V(2).Infof("Starting maintenance cleanup")

    taskRetention := time.Duration(mm.config.TaskRetentionSeconds) * time.Second
    workerTimeout := time.Duration(mm.config.WorkerTimeoutSeconds) * time.Second

    removedTasks := mm.queue.CleanupOldTasks(taskRetention)
    removedWorkers := mm.queue.RemoveStaleWorkers(workerTimeout)

    // Clean up stale pending operations (operations running for more than 4 hours)
    staleOperationTimeout := 4 * time.Hour
    removedOperations := 0
    if mm.scanner != nil && mm.scanner.integration != nil {
        pendingOps := mm.scanner.integration.GetPendingOperations()
        if pendingOps != nil {
            removedOperations = pendingOps.CleanupStaleOperations(staleOperationTimeout)
        }
    }

    if removedTasks > 0 || removedWorkers > 0 || removedOperations > 0 {
        glog.V(1).Infof("Cleanup completed: removed %d old tasks, %d stale workers, and %d stale operations",
            removedTasks, removedWorkers, removedOperations)
    }
}

// GetQueue returns the maintenance queue
func (mm *MaintenanceManager) GetQueue() *MaintenanceQueue {
    return mm.queue
}

// GetConfig returns the maintenance configuration
func (mm *MaintenanceManager) GetConfig() *MaintenanceConfig {
    return mm.config
}

// GetStats returns maintenance statistics
func (mm *MaintenanceManager) GetStats() *MaintenanceStats {
    stats := mm.queue.GetStats()

    mm.mutex.RLock()
    defer mm.mutex.RUnlock()

    stats.LastScanTime = time.Now() // Would need to track this properly

    // Calculate next scan time based on current error state
    scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
    nextScanInterval := scanInterval
    if mm.errorCount > 0 {
        nextScanInterval = mm.backoffDelay
        maxInterval := scanInterval * 10
        if nextScanInterval > maxInterval {
            nextScanInterval = maxInterval
        }
    }
    stats.NextScanTime = time.Now().Add(nextScanInterval)

    return stats
}

// ReloadTaskConfigurations reloads task configurations from the current policy
func (mm *MaintenanceManager) ReloadTaskConfigurations() error {
    mm.mutex.Lock()
    defer mm.mutex.Unlock()

    // Trigger configuration reload in the integration layer
    if mm.scanner != nil && mm.scanner.integration != nil {
        mm.scanner.integration.ConfigureTasksFromPolicy()
        glog.V(1).Infof("Task configurations reloaded from policy")
        return nil
    }

    return fmt.Errorf("integration not available for configuration reload")
}

// GetErrorState returns the current error state for monitoring
func (mm *MaintenanceManager) GetErrorState() (errorCount int, lastError error, backoffDelay time.Duration) {
    mm.mutex.RLock()
    defer mm.mutex.RUnlock()
    return mm.errorCount, mm.lastError, mm.backoffDelay
}

// GetTasks returns tasks with filtering
func (mm *MaintenanceManager) GetTasks(status MaintenanceTaskStatus, taskType MaintenanceTaskType, limit int) []*MaintenanceTask {
    return mm.queue.GetTasks(status, taskType, limit)
}

// GetWorkers returns all registered workers
func (mm *MaintenanceManager) GetWorkers() []*MaintenanceWorker {
    return mm.queue.GetWorkers()
}

// TriggerScan manually triggers a maintenance scan
func (mm *MaintenanceManager) TriggerScan() error {
    return mm.triggerScanInternal(true)
}

// triggerScanInternal handles both manual and automatic scan triggers
func (mm *MaintenanceManager) triggerScanInternal(isManual bool) error {
    if !mm.running {
        return fmt.Errorf("maintenance manager is not running")
    }

    // Prevent multiple concurrent scans
    mm.mutex.Lock()
    if mm.scanInProgress {
        mm.mutex.Unlock()
        if isManual {
            glog.V(1).Infof("Manual scan already in progress, ignoring trigger request")
        } else {
            glog.V(2).Infof("Automatic scan already in progress, ignoring scheduled scan")
        }
        return fmt.Errorf("scan already in progress")
    }
    mm.scanInProgress = true
    mm.mutex.Unlock()

    go mm.performScan()
    return nil
}

// UpdateConfig updates the maintenance configuration
func (mm *MaintenanceManager) UpdateConfig(config *MaintenanceConfig) error {
    if config == nil {
        return fmt.Errorf("config cannot be nil")
    }

    mm.config = config
    mm.queue.policy = config.Policy
    mm.scanner.policy = config.Policy

    // Propagate global policy changes to individual task configuration files
    if config.Policy != nil {
        mm.saveTaskConfigsFromPolicy(config.Policy)
    }

    glog.V(1).Infof("Maintenance configuration updated")
    return nil
}

// saveTaskConfigsFromPolicy propagates global policy settings to separate task configuration files
func (mm *MaintenanceManager) saveTaskConfigsFromPolicy(policy *worker_pb.MaintenancePolicy) {
    if mm.queue.persistence == nil || policy == nil {
        return
    }

    glog.V(1).Infof("Propagating maintenance policy changes to separate task configs")

    for taskType, taskPolicy := range policy.TaskPolicies {
        if err := mm.queue.persistence.SaveTaskPolicy(taskType, taskPolicy); err != nil {
            glog.Errorf("Failed to save task policy for %s: %v", taskType, err)
        }
    }
}

// CancelTask cancels a pending task
func (mm *MaintenanceManager) CancelTask(taskID string) error {
    mm.queue.mutex.Lock()
    defer mm.queue.mutex.Unlock()

    task, exists := mm.queue.tasks[taskID]
    if !exists {
        return fmt.Errorf("task %s not found", taskID)
    }

    if task.Status == TaskStatusPending {
        task.Status = TaskStatusCancelled
        task.CompletedAt = &[]time.Time{time.Now()}[0]

        // Remove from pending tasks
        for i, pendingTask := range mm.queue.pendingTasks {
            if pendingTask.ID == taskID {
                mm.queue.pendingTasks = append(mm.queue.pendingTasks[:i], mm.queue.pendingTasks[i+1:]...)
                break
            }
        }

        glog.V(2).Infof("Cancelled task %s", taskID)
        return nil
    }

    return fmt.Errorf("task %s cannot be cancelled (status: %s)", taskID, task.Status)
}

// RegisterWorker registers a new worker
func (mm *MaintenanceManager) RegisterWorker(worker *MaintenanceWorker) {
    mm.queue.RegisterWorker(worker)
}

// GetNextTask returns the next task for a worker
func (mm *MaintenanceManager) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask {
    return mm.queue.GetNextTask(workerID, capabilities)
}

// CompleteTask marks a task as completed
func (mm *MaintenanceManager) CompleteTask(taskID string, error string) {
    mm.queue.CompleteTask(taskID, error)
}

// UpdateTaskProgress updates task progress
func (mm *MaintenanceManager) UpdateTaskProgress(taskID string, progress float64) {
    mm.queue.UpdateTaskProgress(taskID, progress)
}

// UpdateWorkerHeartbeat updates worker heartbeat
func (mm *MaintenanceManager) UpdateWorkerHeartbeat(workerID string) {
    mm.queue.UpdateWorkerHeartbeat(workerID)
}