* Fix nil pointer panic in maintenance worker when receiving empty task assignment

  When a worker requests a task and none are available, the admin server sends
  an empty TaskAssignment message. The worker was attempting to log the task
  details without checking whether the TaskId was empty, causing a nil pointer
  dereference when accessing taskAssign.Params.VolumeId. This fix adds a check
  for an empty TaskId before processing the assignment, preventing worker
  crashes and improving stability in production environments.

* Add EC integration test for admin-worker maintenance system

  Adds a comprehensive integration test that verifies the end-to-end flow of
  erasure coding maintenance tasks:
  - Admin server detects volumes needing EC encoding
  - Workers register and receive task assignments
  - EC encoding is executed and verified in the master topology
  - File read-back validation confirms data integrity

  The test uses unique absolute working directories for each worker to prevent
  ID conflicts and ensure stable worker registration. Includes proper cleanup
  and process management for reliable test execution.

* Improve maintenance system stability and task deduplication

  - Add cross-type task deduplication to prevent concurrent maintenance
    operations on the same volume (EC, balance, vacuum)
  - Implement HasAnyTask check in ActiveTopology for better coordination
  - Increase RequestTask timeout from 5s to 30s to prevent unnecessary
    worker reconnections
  - Add TaskTypeNone sentinel for generic task checks
  - Update all task detectors to use HasAnyTask for conflict prevention
  - Improve config persistence and schema handling

* Add GitHub Actions workflow for EC integration tests

  Adds a CI workflow that runs EC integration tests on pushes and pull
  requests to the master branch. The workflow:
  - Triggers on changes to admin, worker, or test files
  - Builds the weed binary
  - Runs the EC integration test suite
  - Uploads test logs as artifacts on failure for debugging

  This ensures the maintenance system remains stable and that worker-admin
  integration is validated in CI.

* go version 1.24
* address comments
* Update maintenance_integration.go
* support seconds
* ec prioritize over balancing in tests
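A minimal sketch of the guard described in the first fix (illustrative only; the surrounding handler and variable names are assumptions, while TaskAssignment, TaskId, and Params.VolumeId come from the description above):

	// Skip empty assignments: the admin server sends an empty TaskAssignment
	// when no task is available, and taskAssign.Params is nil in that case.
	if taskAssign.TaskId == "" {
		return nil // nothing to process; poll again later
	}
	glog.V(1).Infof("assigned task %s for volume %d", taskAssign.TaskId, taskAssign.Params.VolumeId)
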
package maintenance

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
)

// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy
func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy {
	policy := &worker_pb.MaintenancePolicy{
		GlobalMaxConcurrent:          4,
		DefaultRepeatIntervalSeconds: 6 * 3600,  // 6 hours in seconds
		DefaultCheckIntervalSeconds:  12 * 3600, // 12 hours in seconds
		TaskPolicies:                 make(map[string]*worker_pb.TaskPolicy),
	}

	// Load vacuum task configuration
	if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil {
		policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{
			Enabled:               vacuumConfig.Enabled,
			MaxConcurrent:         int32(vacuumConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(vacuumConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
				VacuumConfig: &worker_pb.VacuumTaskConfig{
					GarbageThreshold:   float64(vacuumConfig.GarbageThreshold),
					MinVolumeAgeHours:  int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
					MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds),
				},
			},
		}
	}

	// Load erasure coding task configuration
	if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil {
		policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{
			Enabled:               ecConfig.Enabled,
			MaxConcurrent:         int32(ecConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(ecConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
				ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
					FullnessRatio:    float64(ecConfig.FullnessRatio),
					QuietForSeconds:  int32(ecConfig.QuietForSeconds),
					MinVolumeSizeMb:  int32(ecConfig.MinSizeMB),
					CollectionFilter: ecConfig.CollectionFilter,
				},
			},
		}
	}

	// Load balance task configuration
	if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil {
		policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{
			Enabled:               balanceConfig.Enabled,
			MaxConcurrent:         int32(balanceConfig.MaxConcurrent),
			RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
			CheckIntervalSeconds:  int32(balanceConfig.ScanIntervalSeconds),
			TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
				BalanceConfig: &worker_pb.BalanceTaskConfig{
					ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold),
					MinServerCount:     int32(balanceConfig.MinServerCount),
				},
			},
		}
	}

	glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies))
	return policy
}
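
// Example (illustrative): a task type appears in TaskPolicies only when its
// config file loads successfully, so callers should check for presence:
//
//	policy := buildPolicyFromTaskConfigs()
//	if vacuumPolicy, ok := policy.TaskPolicies["vacuum"]; ok {
//		glog.V(1).Infof("vacuum enabled=%v, max concurrent=%d", vacuumPolicy.Enabled, vacuumPolicy.MaxConcurrent)
//	}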

// MaintenanceManager coordinates the maintenance system
type MaintenanceManager struct {
	config      *MaintenanceConfig
	scanner     *MaintenanceScanner
	queue       *MaintenanceQueue
	adminClient AdminClient
	running     bool
	stopChan    chan struct{}
	// Error handling and backoff
	errorCount     int
	lastError      error
	lastErrorTime  time.Time
	backoffDelay   time.Duration
	mutex          sync.RWMutex // guards running, scanInProgress, and the error state above
	scanInProgress bool
}

// NewMaintenanceManager creates a new maintenance manager
func NewMaintenanceManager(adminClient AdminClient, config *MaintenanceConfig) *MaintenanceManager {
	if config == nil {
		config = DefaultMaintenanceConfig()
	}

	// Use the policy from the config (which is populated from separate task files in LoadMaintenanceConfig)
	policy := config.Policy
	if policy == nil {
		// Fallback: build policy from separate task configuration files if not already populated
		policy = buildPolicyFromTaskConfigs()
	}

	queue := NewMaintenanceQueue(policy)
	scanner := NewMaintenanceScanner(adminClient, policy, queue)

	return &MaintenanceManager{
		config:       config,
		scanner:      scanner,
		queue:        queue,
		adminClient:  adminClient,
		stopChan:     make(chan struct{}),
		backoffDelay: time.Second, // Start with 1 second backoff
	}
}
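
// Example wiring (an illustrative sketch; assumes an AdminClient
// implementation and a loaded *MaintenanceConfig are available):
//
//	mgr := NewMaintenanceManager(adminClient, config)
//	if err := mgr.Start(); err != nil {
//		glog.Fatalf("failed to start maintenance manager: %v", err)
//	}
//	defer mgr.Stop()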

// Start begins the maintenance manager
func (mm *MaintenanceManager) Start() error {
	if !mm.config.Enabled {
		glog.V(1).Infof("Maintenance system is disabled")
		return nil
	}

	// Validate configuration durations to prevent ticker panics
	if err := mm.validateConfig(); err != nil {
		return fmt.Errorf("invalid maintenance configuration: %w", err)
	}

	// Set the running flag under the mutex; Stop and the background loops
	// read it concurrently.
	mm.mutex.Lock()
	mm.running = true
	mm.mutex.Unlock()

	// Start background processes
	go mm.scanLoop()
	go mm.cleanupLoop()
	go mm.topologyStatusLoop() // Periodic diagnostic logging

	glog.Infof("Maintenance manager started with scan interval %ds", mm.config.ScanIntervalSeconds)
	return nil
}

// validateConfig validates the maintenance configuration durations
func (mm *MaintenanceManager) validateConfig() error {
	if mm.config.ScanIntervalSeconds <= 0 {
		glog.Warningf("Invalid scan interval %ds, using default 30m", mm.config.ScanIntervalSeconds)
		mm.config.ScanIntervalSeconds = 30 * 60 // 30 minutes in seconds
	}

	if mm.config.CleanupIntervalSeconds <= 0 {
		glog.Warningf("Invalid cleanup interval %ds, using default 24h", mm.config.CleanupIntervalSeconds)
		mm.config.CleanupIntervalSeconds = 24 * 60 * 60 // 24 hours in seconds
	}

	if mm.config.WorkerTimeoutSeconds <= 0 {
		glog.Warningf("Invalid worker timeout %ds, using default 5m", mm.config.WorkerTimeoutSeconds)
		mm.config.WorkerTimeoutSeconds = 5 * 60 // 5 minutes in seconds
	}

	if mm.config.TaskTimeoutSeconds <= 0 {
		glog.Warningf("Invalid task timeout %ds, using default 2h", mm.config.TaskTimeoutSeconds)
		mm.config.TaskTimeoutSeconds = 2 * 60 * 60 // 2 hours in seconds
	}

	if mm.config.RetryDelaySeconds <= 0 {
		glog.Warningf("Invalid retry delay %ds, using default 15m", mm.config.RetryDelaySeconds)
		mm.config.RetryDelaySeconds = 15 * 60 // 15 minutes in seconds
	}

	if mm.config.TaskRetentionSeconds <= 0 {
		glog.Warningf("Invalid task retention %ds, using default 168h", mm.config.TaskRetentionSeconds)
		mm.config.TaskRetentionSeconds = 7 * 24 * 60 * 60 // 7 days in seconds
	}

	return nil
}

// IsRunning returns whether the maintenance manager is currently running
func (mm *MaintenanceManager) IsRunning() bool {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()
	return mm.running
}

// Stop terminates the maintenance manager
func (mm *MaintenanceManager) Stop() {
	mm.mutex.Lock()
	mm.running = false
	mm.mutex.Unlock()
	close(mm.stopChan)
	glog.Infof("Maintenance manager stopped")
}

// scanLoop periodically scans for maintenance tasks with adaptive timing
func (mm *MaintenanceManager) scanLoop() {
	baseInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
	currentInterval := baseInterval
	ticker := time.NewTicker(currentInterval)
	defer ticker.Stop()

	for mm.IsRunning() {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			glog.V(1).Infof("Performing maintenance scan every %v", currentInterval)

			// Use the same synchronization as TriggerScan to prevent concurrent scans
			if err := mm.triggerScanInternal(false); err != nil {
				glog.V(1).Infof("Scheduled scan skipped: %v", err)
			}

			// Adjust the ticker interval based on the error state (read safely),
			// tracking the current interval so the ticker is only reset when it
			// actually changes.
			if newInterval := mm.getScanInterval(baseInterval); newInterval != currentInterval {
				currentInterval = newInterval
				ticker.Reset(currentInterval)
			}
		}
	}
}

// getScanInterval safely reads the current scan interval with error backoff
func (mm *MaintenanceManager) getScanInterval(baseInterval time.Duration) time.Duration {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()

	if mm.errorCount > 0 {
		// Use the backoff delay when there are errors, but never make it
		// longer than the configured interval * 10
		currentInterval := mm.backoffDelay
		if maxInterval := baseInterval * 10; currentInterval > maxInterval {
			currentInterval = maxInterval
		}
		return currentInterval
	}
	return baseInterval
}
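
// Illustrative backoff progression (a sketch derived from getScanInterval
// above and handleScanError below, assuming a 30s configured scan interval):
// backoffDelay starts at 1s, doubles on each consecutive failure, and is
// capped at 5m; getScanInterval further caps the effective interval at 10x
// the base interval.
//
//	failure #1: 1s, #2: 2s, #3: 4s, ... #9: 256s, #10+: 300s (the 5m cap and 10x-base cap coincide here)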

// cleanupLoop periodically cleans up old tasks and stale workers
func (mm *MaintenanceManager) cleanupLoop() {
	cleanupInterval := time.Duration(mm.config.CleanupIntervalSeconds) * time.Second
	ticker := time.NewTicker(cleanupInterval)
	defer ticker.Stop()

	for mm.IsRunning() {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			mm.performCleanup()
		}
	}
}

// topologyStatusLoop periodically logs topology status for diagnostics
func (mm *MaintenanceManager) topologyStatusLoop() {
	// Log topology status every 5 minutes for diagnostic purposes
	statusInterval := 5 * time.Minute
	ticker := time.NewTicker(statusInterval)
	defer ticker.Stop()

	for mm.IsRunning() {
		select {
		case <-mm.stopChan:
			return
		case <-ticker.C:
			mm.logTopologyStatus()
		}
	}
}

// logTopologyStatus logs current topology and worker status for diagnostics
func (mm *MaintenanceManager) logTopologyStatus() {
	if mm.scanner == nil || mm.scanner.integration == nil {
		glog.V(2).Infof("Topology status: scanner/integration not available")
		return
	}

	activeTopology := mm.scanner.integration.GetActiveTopology()
	if activeTopology == nil {
		glog.V(1).Infof("Topology status: ActiveTopology is nil")
		return
	}

	diskCount := activeTopology.GetDiskCount()
	nodeCount := len(activeTopology.GetAllNodes())

	// Get queue stats
	stats := mm.queue.GetStats()
	workerCount := len(mm.queue.GetWorkers())

	mm.mutex.RLock()
	errorCount := mm.errorCount
	mm.mutex.RUnlock()

	glog.V(0).Infof("Topology status: %d nodes, %d disks, %d workers, %d pending tasks, %d running tasks, errors: %d",
		nodeCount, diskCount, workerCount,
		stats.TasksByStatus[TaskStatusPending],
		stats.TasksByStatus[TaskStatusInProgress]+stats.TasksByStatus[TaskStatusAssigned],
		errorCount)
}

// performScan executes a maintenance scan with error handling and backoff
func (mm *MaintenanceManager) performScan() {
	defer func() {
		// Always reset scan in progress flag when done
		mm.mutex.Lock()
		mm.scanInProgress = false
		mm.mutex.Unlock()
	}()

	glog.Infof("Starting maintenance scan...")

	results, err := mm.scanner.ScanForMaintenanceTasks()
	if err != nil {
		// Handle scan error
		mm.mutex.Lock()
		mm.handleScanError(err)
		mm.mutex.Unlock()
		glog.Warningf("Maintenance scan failed: %v", err)
		return
	}

	// Scan succeeded - update state and process results
	mm.handleScanSuccess(results)
}

// handleScanSuccess processes successful scan results with proper lock management
func (mm *MaintenanceManager) handleScanSuccess(results []*TaskDetectionResult) {
	// Update manager state first
	mm.mutex.Lock()
	mm.resetErrorTracking()
	taskCount := len(results)
	mm.mutex.Unlock()

	if taskCount > 0 {
		// Count tasks by type for logging (outside of lock)
		taskCounts := make(map[MaintenanceTaskType]int)
		for _, result := range results {
			taskCounts[result.TaskType]++
		}

		// Add tasks to queue (no manager lock held)
		mm.queue.AddTasksFromResults(results)

		// Log detailed scan results
		glog.Infof("Maintenance scan completed: found %d tasks", taskCount)
		for taskType, count := range taskCounts {
			glog.Infof("  - %s: %d tasks", taskType, count)
		}
	} else {
		glog.Infof("Maintenance scan completed: no maintenance tasks needed")
	}
}

// handleScanError handles scan errors with exponential backoff and reduced logging
func (mm *MaintenanceManager) handleScanError(err error) {
	now := time.Now()
	mm.errorCount++
	mm.lastError = err
	mm.lastErrorTime = now

	// Use exponential backoff (doubling per consecutive failure)
	if mm.errorCount > 1 {
		mm.backoffDelay = mm.backoffDelay * 2
		if mm.backoffDelay > 5*time.Minute {
			mm.backoffDelay = 5 * time.Minute // Cap at 5 minutes
		}
	}

	// Reduce log frequency based on error count and time
	shouldLog := false
	if mm.errorCount <= 3 {
		// Log first 3 errors immediately
		shouldLog = true
	} else if mm.errorCount <= 10 && mm.errorCount%3 == 0 {
		// Log every 3rd error for errors 4-10
		shouldLog = true
	} else if mm.errorCount%10 == 0 {
		// Log every 10th error after that
		shouldLog = true
	}

	if shouldLog {
		// Check if it's a connection error to provide better messaging
		if isConnectionError(err) {
			if mm.errorCount == 1 {
				glog.Errorf("Maintenance scan failed: %v (will retry with backoff)", err)
			} else {
				glog.Errorf("Maintenance scan still failing after %d attempts: %v (backoff: %v)",
					mm.errorCount, err, mm.backoffDelay)
			}
		} else {
			glog.Errorf("Maintenance scan failed: %v", err)
		}
	} else {
		// Use debug level for suppressed errors
		glog.V(3).Infof("Maintenance scan failed (error #%d, suppressed): %v", mm.errorCount, err)
	}
}

// resetErrorTracking resets error tracking when scan succeeds
func (mm *MaintenanceManager) resetErrorTracking() {
	if mm.errorCount > 0 {
		glog.V(1).Infof("Maintenance scan recovered after %d failed attempts", mm.errorCount)
		mm.errorCount = 0
		mm.lastError = nil
		mm.backoffDelay = time.Second // Reset to initial delay
	}
}

// isConnectionError checks if the error is a connection-related error
func isConnectionError(err error) bool {
	if err == nil {
		return false
	}
	errStr := err.Error()
	return strings.Contains(errStr, "connection refused") ||
		strings.Contains(errStr, "connection error") ||
		strings.Contains(errStr, "dial tcp") ||
		strings.Contains(errStr, "connection timeout") ||
		strings.Contains(errStr, "no route to host") ||
		strings.Contains(errStr, "network unreachable")
}

// performCleanup cleans up old tasks and stale workers
func (mm *MaintenanceManager) performCleanup() {
	glog.V(2).Infof("Starting maintenance cleanup")

	taskRetention := time.Duration(mm.config.TaskRetentionSeconds) * time.Second
	workerTimeout := time.Duration(mm.config.WorkerTimeoutSeconds) * time.Second

	removedTasks := mm.queue.CleanupOldTasks(taskRetention)
	removedWorkers := mm.queue.RemoveStaleWorkers(workerTimeout)

	// Clean up stale pending operations (operations running for more than 4 hours)
	staleOperationTimeout := 4 * time.Hour
	removedOperations := 0
	if mm.scanner != nil && mm.scanner.integration != nil {
		pendingOps := mm.scanner.integration.GetPendingOperations()
		if pendingOps != nil {
			removedOperations = pendingOps.CleanupStaleOperations(staleOperationTimeout)
		}
	}

	if removedTasks > 0 || removedWorkers > 0 || removedOperations > 0 {
		glog.V(1).Infof("Cleanup completed: removed %d old tasks, %d stale workers, and %d stale operations",
			removedTasks, removedWorkers, removedOperations)
	}
}

// GetQueue returns the maintenance queue
func (mm *MaintenanceManager) GetQueue() *MaintenanceQueue {
	return mm.queue
}

// GetConfig returns the maintenance configuration
func (mm *MaintenanceManager) GetConfig() *MaintenanceConfig {
	return mm.config
}

// GetStats returns maintenance statistics
func (mm *MaintenanceManager) GetStats() *MaintenanceStats {
	stats := mm.queue.GetStats()

	mm.mutex.RLock()
	defer mm.mutex.RUnlock()

	stats.LastScanTime = time.Now() // Would need to track this properly

	// Calculate next scan time based on current error state
	scanInterval := time.Duration(mm.config.ScanIntervalSeconds) * time.Second
	nextScanInterval := scanInterval
	if mm.errorCount > 0 {
		nextScanInterval = mm.backoffDelay
		maxInterval := scanInterval * 10
		if nextScanInterval > maxInterval {
			nextScanInterval = maxInterval
		}
	}
	stats.NextScanTime = time.Now().Add(nextScanInterval)

	return stats
}

// ReloadTaskConfigurations reloads task configurations from the current policy
func (mm *MaintenanceManager) ReloadTaskConfigurations() error {
	mm.mutex.Lock()
	defer mm.mutex.Unlock()

	// Trigger configuration reload in the integration layer
	if mm.scanner != nil && mm.scanner.integration != nil {
		mm.scanner.integration.ConfigureTasksFromPolicy()
		glog.V(1).Infof("Task configurations reloaded from policy")
		return nil
	}

	return fmt.Errorf("integration not available for configuration reload")
}

// GetErrorState returns the current error state for monitoring
func (mm *MaintenanceManager) GetErrorState() (errorCount int, lastError error, backoffDelay time.Duration) {
	mm.mutex.RLock()
	defer mm.mutex.RUnlock()
	return mm.errorCount, mm.lastError, mm.backoffDelay
}
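
// Example (illustrative) health probe built on GetErrorState:
//
//	count, lastErr, delay := mgr.GetErrorState()
//	if count > 0 {
//		glog.Warningf("maintenance degraded: %d consecutive errors, last=%v, backoff=%v", count, lastErr, delay)
//	}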

// GetTasks returns tasks with filtering
func (mm *MaintenanceManager) GetTasks(status MaintenanceTaskStatus, taskType MaintenanceTaskType, limit int) []*MaintenanceTask {
	return mm.queue.GetTasks(status, taskType, limit)
}

// GetWorkers returns all registered workers
func (mm *MaintenanceManager) GetWorkers() []*MaintenanceWorker {
	return mm.queue.GetWorkers()
}

// TriggerScan manually triggers a maintenance scan
func (mm *MaintenanceManager) TriggerScan() error {
	return mm.triggerScanInternal(true)
}

// triggerScanInternal handles both manual and automatic scan triggers
func (mm *MaintenanceManager) triggerScanInternal(isManual bool) error {
	if !mm.IsRunning() {
		return fmt.Errorf("maintenance manager is not running")
	}

	// Prevent multiple concurrent scans
	mm.mutex.Lock()
	if mm.scanInProgress {
		mm.mutex.Unlock()
		if isManual {
			glog.V(1).Infof("Manual scan already in progress, ignoring trigger request")
		} else {
			glog.V(2).Infof("Automatic scan already in progress, ignoring scheduled scan")
		}
		return fmt.Errorf("scan already in progress")
	}
	mm.scanInProgress = true
	mm.mutex.Unlock()

	go mm.performScan()
	return nil
}
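
// Example (illustrative): TriggerScan returns an error instead of blocking
// when a scan is already in flight, so callers can simply surface it:
//
//	if err := mgr.TriggerScan(); err != nil {
//		glog.V(1).Infof("manual scan not started: %v", err)
//	}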

// UpdateConfig updates the maintenance configuration
func (mm *MaintenanceManager) UpdateConfig(config *MaintenanceConfig) error {
	if config == nil {
		return fmt.Errorf("config cannot be nil")
	}

	mm.config = config
	mm.queue.policy = config.Policy
	mm.scanner.policy = config.Policy

	// Propagate global policy changes to individual task configuration files
	if config.Policy != nil {
		mm.saveTaskConfigsFromPolicy(config.Policy)
	}

	glog.V(1).Infof("Maintenance configuration updated")
	return nil
}

// saveTaskConfigsFromPolicy propagates global policy settings to separate task configuration files
func (mm *MaintenanceManager) saveTaskConfigsFromPolicy(policy *worker_pb.MaintenancePolicy) {
	if mm.queue.persistence == nil || policy == nil {
		return
	}

	glog.V(1).Infof("Propagating maintenance policy changes to separate task configs")
	for taskType, taskPolicy := range policy.TaskPolicies {
		if err := mm.queue.persistence.SaveTaskPolicy(taskType, taskPolicy); err != nil {
			glog.Errorf("Failed to save task policy for %s: %v", taskType, err)
		}
	}
}

// CancelTask cancels a pending task
func (mm *MaintenanceManager) CancelTask(taskID string) error {
	mm.queue.mutex.Lock()
	defer mm.queue.mutex.Unlock()

	task, exists := mm.queue.tasks[taskID]
	if !exists {
		return fmt.Errorf("task %s not found", taskID)
	}

	if task.Status == TaskStatusPending {
		task.Status = TaskStatusCancelled
		completedAt := time.Now()
		task.CompletedAt = &completedAt

		// Remove from pending tasks
		for i, pendingTask := range mm.queue.pendingTasks {
			if pendingTask.ID == taskID {
				mm.queue.pendingTasks = append(mm.queue.pendingTasks[:i], mm.queue.pendingTasks[i+1:]...)
				break
			}
		}

		glog.V(2).Infof("Cancelled task %s", taskID)
		return nil
	}

	return fmt.Errorf("task %s cannot be cancelled (status: %s)", taskID, task.Status)
}

// RegisterWorker registers a new worker
func (mm *MaintenanceManager) RegisterWorker(worker *MaintenanceWorker) {
	mm.queue.RegisterWorker(worker)
}

// GetNextTask returns the next task for a worker
func (mm *MaintenanceManager) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask {
	return mm.queue.GetNextTask(workerID, capabilities)
}

// CompleteTask marks a task as completed, recording errorMsg if the task failed
func (mm *MaintenanceManager) CompleteTask(taskID string, errorMsg string) {
	mm.queue.CompleteTask(taskID, errorMsg)
}

// UpdateTaskProgress updates task progress
func (mm *MaintenanceManager) UpdateTaskProgress(taskID string, progress float64) {
	mm.queue.UpdateTaskProgress(taskID, progress)
}

// UpdateWorkerHeartbeat updates worker heartbeat
func (mm *MaintenanceManager) UpdateWorkerHeartbeat(workerID string) {
	mm.queue.UpdateWorkerHeartbeat(workerID)
}