Admin: misc improvements to the admin server and workers. EC now works. (#7055)

* initial design

* added simulation as tests

* reorganized the codebase to move the simulation framework and tests into their own dedicated package

* integration test for the ec worker task

* remove "enhanced" reference

* start master, volume servers, filer

Current Status
 ✅ Master: Healthy and running (port 9333)
 ✅ Filer: Healthy and running (port 8888)
 ✅ Volume Servers: All 6 servers running (ports 8080-8085)
 🔄 Admin/Workers: Will start when dependencies are ready

* generate write load

* tasks are assigned

* admin starts with grpc port. worker has its own working directory

* Update .gitignore

* working worker and admin. Task detection is not working yet.

* compiles, detection uses volumeSizeLimitMB from master

* compiles

* worker retries connecting to admin

* build and restart

* rendering pending tasks

* skip task ID column

* sticky worker id

* test canScheduleTaskNow

* worker reconnect to admin

* clean up logs

* worker registers itself first

* worker can run ec work and report status

but:
1. one volume should not be repeatedly worked on (see the sketch below).
2. ec shards need to be distributed and the source data should be deleted.
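
A minimal sketch of the guard implied by point 1, assuming a simple in-memory map (all names here are hypothetical, not the PR's actual types; the PR itself later addresses this with pending-operation tracking in the admin server):

```go
package pending

import (
	"sync"
	"time"
)

// pendingVolumes is a hypothetical guard: volumes with an in-flight task are
// recorded so detection can skip them instead of queueing duplicate work.
type pendingVolumes struct {
	mu      sync.Mutex
	started map[uint32]time.Time // volume id -> when work began
}

func newPendingVolumes() *pendingVolumes {
	return &pendingVolumes{started: make(map[uint32]time.Time)}
}

// tryStart returns false if the volume is already being worked on.
func (p *pendingVolumes) tryStart(volumeId uint32) bool {
	p.mu.Lock()
	defer p.mu.Unlock()
	if _, busy := p.started[volumeId]; busy {
		return false
	}
	p.started[volumeId] = time.Now()
	return true
}

// finish clears the volume once its task completes or fails.
func (p *pendingVolumes) finish(volumeId uint32) {
	p.mu.Lock()
	defer p.mu.Unlock()
	delete(p.started, volumeId)
}
```

A worker (or the detector) would call tryStart before queueing EC work for a volume and finish when the task reports completion.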

* move ec task logic

* listing ec shards

* local copy, ec. Need to distribute.

* ec is mostly working now

* distribution of ec shards needs improvement
* need configuration to enable ec

* show ec volumes

* interval field UI component

* rename

* integration test with vacuuming

* garbage percentage threshold

* fix warning

* display ec shard sizes

* fix ec volumes list

* Update ui.go

* show default values

* ensure correct default value

* MaintenanceConfig use ConfigField

* use schema defined defaults

* config

* reduce duplication

* refactor to use BaseUIProvider

* each task register its schema

* checkECEncodingCandidate use ecDetector

* use vacuumDetector

* use volumeSizeLimitMB

* remove

* remove unused

* refactor

* use new framework

* remove v2 reference

* refactor

* left menu can scroll now

* fix: the maintenance manager was not being initialized when no data directory was configured for persistent storage.

* saving config

* Update task_config_schema_templ.go

* enable/disable tasks

* protobuf encoded task configurations

* fix system settings

* use ui component

* remove logs

* interface{} reduction

* reduce interface{}

* reduce interface{}

* avoid from/to map

* reduce interface{}

* refactor

* keep it DRY

* added logging

* debug messages

* debug level

* debug

* show the log caller line

* use configured task policy

* log level

* handle admin heartbeat response

* Update worker.go

* fix EC rack and dc count

* Report task status to admin server

* fix task logging, simplify interface checking, use erasure_coding constants

* factor in empty volume server during task planning

* volume.list adds disk id

* track disk id also

* fix locking scheduled and manual scanning

* add active topology

* simplify task detector

* ec task completed, but shards are not showing up

* implement ec in ec_typed.go

* adjust log level

* dedup

* implementing copying of ec shards, and only the ecx files

* use disk id when distributing ec shards

🎯 Planning: ActiveTopology creates DestinationPlan with specific TargetDisk
📦 Task Creation: maintenance_integration.go creates ECDestination with DiskId
🚀 Task Execution: EC task passes DiskId in VolumeEcShardsCopyRequest
💾 Volume Server: Receives disk_id and stores shards on specific disk (vs.store.Locations[req.DiskId])
📂 File System: EC shards and metadata land in the exact disk directory planned
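
To make the DiskId plumbing concrete, a rough self-contained sketch of the last two steps follows; every type here is a stand-in (only DiskId, VolumeEcShardsCopyRequest, and the Locations[req.DiskId] lookup come from the flow above):

```go
package ecsketch

import "fmt"

// Stand-ins for the real volume server types; only the DiskId plumbing
// mirrors the flow summarized above.
type VolumeEcShardsCopyRequest struct {
	VolumeId uint32
	ShardIds []uint32
	DiskId   uint32 // chosen by ActiveTopology, carried via ECDestination
}

type DiskLocation struct {
	Directory string
}

type Store struct {
	Locations []*DiskLocation // one entry per configured disk
}

// shardDirectory resolves the disk directory the planner picked, i.e. the
// vs.store.Locations[req.DiskId] lookup described above.
func (s *Store) shardDirectory(req *VolumeEcShardsCopyRequest) (string, error) {
	if int(req.DiskId) >= len(s.Locations) {
		return "", fmt.Errorf("disk id %d out of range (%d disks)", req.DiskId, len(s.Locations))
	}
	return s.Locations[req.DiskId].Directory, nil
}
```

Keeping the disk index explicit end to end is what lets the shards land in the exact directory the planner chose.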

* Delete original volume from all locations

* clean up existing shard locations

* local encoding and distributing

* Update docker/admin_integration/EC-TESTING-README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* check volume id range

* simplify

* fix tests

* fix types

* clean up logs and tests

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Commit 891a2fb6eb (parent 64198dad83) by Chris Lu, committed by GitHub, 2025-07-30 12:38:03 -07:00.
130 changed files with 27737 additions and 4429 deletions.


@@ -7,8 +7,76 @@ import (
 	"time"

 	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
+	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance"
+	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
+	"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
 )

+// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy
+func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy {
+	policy := &worker_pb.MaintenancePolicy{
+		GlobalMaxConcurrent:          4,
+		DefaultRepeatIntervalSeconds: 6 * 3600,  // 6 hours in seconds
+		DefaultCheckIntervalSeconds:  12 * 3600, // 12 hours in seconds
+		TaskPolicies:                 make(map[string]*worker_pb.TaskPolicy),
+	}
+
+	// Load vacuum task configuration
+	if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil {
+		policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{
+			Enabled:               vacuumConfig.Enabled,
+			MaxConcurrent:         int32(vacuumConfig.MaxConcurrent),
+			RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
+			CheckIntervalSeconds:  int32(vacuumConfig.ScanIntervalSeconds),
+			TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
+				VacuumConfig: &worker_pb.VacuumTaskConfig{
+					GarbageThreshold:   float64(vacuumConfig.GarbageThreshold),
+					MinVolumeAgeHours:  int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
+					MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds),
+				},
+			},
+		}
+	}
+
+	// Load erasure coding task configuration
+	if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil {
+		policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{
+			Enabled:               ecConfig.Enabled,
+			MaxConcurrent:         int32(ecConfig.MaxConcurrent),
+			RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
+			CheckIntervalSeconds:  int32(ecConfig.ScanIntervalSeconds),
+			TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
+				ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
+					FullnessRatio:    float64(ecConfig.FullnessRatio),
+					QuietForSeconds:  int32(ecConfig.QuietForSeconds),
+					MinVolumeSizeMb:  int32(ecConfig.MinSizeMB),
+					CollectionFilter: ecConfig.CollectionFilter,
+				},
+			},
+		}
+	}
+
+	// Load balance task configuration
+	if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil {
+		policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{
+			Enabled:               balanceConfig.Enabled,
+			MaxConcurrent:         int32(balanceConfig.MaxConcurrent),
+			RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
+			CheckIntervalSeconds:  int32(balanceConfig.ScanIntervalSeconds),
+			TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
+				BalanceConfig: &worker_pb.BalanceTaskConfig{
+					ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold),
+					MinServerCount:     int32(balanceConfig.MinServerCount),
+				},
+			},
+		}
+	}
+
+	glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies))
+	return policy
+}
+
 // MaintenanceManager coordinates the maintenance system
 type MaintenanceManager struct {
 	config *MaintenanceConfig
@@ -18,11 +86,12 @@ type MaintenanceManager struct {
 	running  bool
 	stopChan chan struct{}

 	// Error handling and backoff
-	errorCount    int
-	lastError     error
-	lastErrorTime time.Time
-	backoffDelay  time.Duration
-	mutex         sync.RWMutex
+	errorCount     int
+	lastError      error
+	lastErrorTime  time.Time
+	backoffDelay   time.Duration
+	mutex          sync.RWMutex
+	scanInProgress bool
 }

 // NewMaintenanceManager creates a new maintenance manager
@@ -31,8 +100,15 @@ func NewMaintenanceManager(adminClient AdminClient, config *MaintenanceConfig) *MaintenanceManager {
 		config = DefaultMaintenanceConfig()
 	}

-	queue := NewMaintenanceQueue(config.Policy)
-	scanner := NewMaintenanceScanner(adminClient, config.Policy, queue)
+	// Use the policy from the config (which is populated from separate task files in LoadMaintenanceConfig)
+	policy := config.Policy
+	if policy == nil {
+		// Fallback: build policy from separate task configuration files if not already populated
+		policy = buildPolicyFromTaskConfigs()
+	}
+
+	queue := NewMaintenanceQueue(policy)
+	scanner := NewMaintenanceScanner(adminClient, policy, queue)

 	return &MaintenanceManager{
 		config: config,
@@ -125,23 +201,14 @@ func (mm *MaintenanceManager) scanLoop() {
 			return
 		case <-ticker.C:
-			glog.V(1).Infof("Performing maintenance scan every %v", scanInterval)
-			mm.performScan()
-
-			// Adjust ticker interval based on error state
-			mm.mutex.RLock()
-			currentInterval := scanInterval
-			if mm.errorCount > 0 {
-				// Use backoff delay when there are errors
-				currentInterval = mm.backoffDelay
-				if currentInterval > scanInterval {
-					// Don't make it longer than the configured interval * 10
-					maxInterval := scanInterval * 10
-					if currentInterval > maxInterval {
-						currentInterval = maxInterval
-					}
-				}
-			}
-			mm.mutex.RUnlock()
+			// Use the same synchronization as TriggerScan to prevent concurrent scans
+			if err := mm.triggerScanInternal(false); err != nil {
+				glog.V(1).Infof("Scheduled scan skipped: %v", err)
+			}
+
+			// Adjust ticker interval based on error state (read error state safely)
+			currentInterval := mm.getScanInterval(scanInterval)

 			// Reset ticker with new interval if needed
 			if currentInterval != scanInterval {
@@ -152,6 +219,26 @@ func (mm *MaintenanceManager) scanLoop() {
 		}
 	}
 }

+// getScanInterval safely reads the current scan interval with error backoff
+func (mm *MaintenanceManager) getScanInterval(baseInterval time.Duration) time.Duration {
+	mm.mutex.RLock()
+	defer mm.mutex.RUnlock()
+
+	if mm.errorCount > 0 {
+		// Use backoff delay when there are errors
+		currentInterval := mm.backoffDelay
+		if currentInterval > baseInterval {
+			// Don't make it longer than the configured interval * 10
+			maxInterval := baseInterval * 10
+			if currentInterval > maxInterval {
+				currentInterval = maxInterval
+			}
+		}
+		return currentInterval
+	}
+
+	return baseInterval
+}
+
 // cleanupLoop periodically cleans up old tasks and stale workers
 func (mm *MaintenanceManager) cleanupLoop() {
 	cleanupInterval := time.Duration(mm.config.CleanupIntervalSeconds) * time.Second
@@ -170,25 +257,54 @@ func (mm *MaintenanceManager) cleanupLoop() {

 // performScan executes a maintenance scan with error handling and backoff
 func (mm *MaintenanceManager) performScan() {
-	mm.mutex.Lock()
-	defer mm.mutex.Unlock()
+	defer func() {
+		// Always reset scan in progress flag when done
+		mm.mutex.Lock()
+		mm.scanInProgress = false
+		mm.mutex.Unlock()
+	}()

-	glog.V(2).Infof("Starting maintenance scan")
+	glog.Infof("Starting maintenance scan...")

 	results, err := mm.scanner.ScanForMaintenanceTasks()
 	if err != nil {
+		// Handle scan error
+		mm.mutex.Lock()
 		mm.handleScanError(err)
+		mm.mutex.Unlock()
+		glog.Warningf("Maintenance scan failed: %v", err)
 		return
 	}

-	// Scan succeeded, reset error tracking
-	mm.resetErrorTracking()
-
-	if len(results) > 0 {
-		mm.queue.AddTasksFromResults(results)
-		glog.V(1).Infof("Maintenance scan completed: added %d tasks", len(results))
-	} else {
-		glog.V(2).Infof("Maintenance scan completed: no tasks needed")
-	}
-}
+	// Scan succeeded - update state and process results
+	mm.handleScanSuccess(results)
+}
+
+// handleScanSuccess processes successful scan results with proper lock management
+func (mm *MaintenanceManager) handleScanSuccess(results []*TaskDetectionResult) {
+	// Update manager state first
+	mm.mutex.Lock()
+	mm.resetErrorTracking()
+	taskCount := len(results)
+	mm.mutex.Unlock()
+
+	if taskCount > 0 {
+		// Count tasks by type for logging (outside of lock)
+		taskCounts := make(map[MaintenanceTaskType]int)
+		for _, result := range results {
+			taskCounts[result.TaskType]++
+		}
+
+		// Add tasks to queue (no manager lock held)
+		mm.queue.AddTasksFromResults(results)
+
+		// Log detailed scan results
+		glog.Infof("Maintenance scan completed: found %d tasks", taskCount)
+		for taskType, count := range taskCounts {
+			glog.Infof("  - %s: %d tasks", taskType, count)
+		}
+	} else {
+		glog.Infof("Maintenance scan completed: no maintenance tasks needed")
+	}
+}
@@ -272,8 +388,19 @@ func (mm *MaintenanceManager) performCleanup() {
 	removedTasks := mm.queue.CleanupOldTasks(taskRetention)
 	removedWorkers := mm.queue.RemoveStaleWorkers(workerTimeout)

-	if removedTasks > 0 || removedWorkers > 0 {
-		glog.V(1).Infof("Cleanup completed: removed %d old tasks and %d stale workers", removedTasks, removedWorkers)
+	// Clean up stale pending operations (operations running for more than 4 hours)
+	staleOperationTimeout := 4 * time.Hour
+	removedOperations := 0
+	if mm.scanner != nil && mm.scanner.integration != nil {
+		pendingOps := mm.scanner.integration.GetPendingOperations()
+		if pendingOps != nil {
+			removedOperations = pendingOps.CleanupStaleOperations(staleOperationTimeout)
+		}
+	}
+
+	if removedTasks > 0 || removedWorkers > 0 || removedOperations > 0 {
+		glog.V(1).Infof("Cleanup completed: removed %d old tasks, %d stale workers, and %d stale operations",
+			removedTasks, removedWorkers, removedOperations)
 	}
 }
@@ -311,6 +438,21 @@ func (mm *MaintenanceManager) GetStats() *MaintenanceStats {
 	return stats
 }

+// ReloadTaskConfigurations reloads task configurations from the current policy
+func (mm *MaintenanceManager) ReloadTaskConfigurations() error {
+	mm.mutex.Lock()
+	defer mm.mutex.Unlock()
+
+	// Trigger configuration reload in the integration layer
+	if mm.scanner != nil && mm.scanner.integration != nil {
+		mm.scanner.integration.ConfigureTasksFromPolicy()
+		glog.V(1).Infof("Task configurations reloaded from policy")
+		return nil
+	}
+
+	return fmt.Errorf("integration not available for configuration reload")
+}
+
 // GetErrorState returns the current error state for monitoring
 func (mm *MaintenanceManager) GetErrorState() (errorCount int, lastError error, backoffDelay time.Duration) {
 	mm.mutex.RLock()
@@ -330,10 +472,29 @@ func (mm *MaintenanceManager) GetWorkers() []*MaintenanceWorker {

 // TriggerScan manually triggers a maintenance scan
 func (mm *MaintenanceManager) TriggerScan() error {
+	return mm.triggerScanInternal(true)
+}
+
+// triggerScanInternal handles both manual and automatic scan triggers
+func (mm *MaintenanceManager) triggerScanInternal(isManual bool) error {
 	if !mm.running {
 		return fmt.Errorf("maintenance manager is not running")
 	}

+	// Prevent multiple concurrent scans
+	mm.mutex.Lock()
+	if mm.scanInProgress {
+		mm.mutex.Unlock()
+		if isManual {
+			glog.V(1).Infof("Manual scan already in progress, ignoring trigger request")
+		} else {
+			glog.V(2).Infof("Automatic scan already in progress, ignoring scheduled scan")
+		}
+		return fmt.Errorf("scan already in progress")
+	}
+	mm.scanInProgress = true
+	mm.mutex.Unlock()
+
 	go mm.performScan()
 	return nil
 }