Fix maintenance worker panic and add EC integration tests (#8068)

* Fix nil pointer panic in maintenance worker when receiving empty task assignment

When a worker requests a task and none are available, the admin server
sends an empty TaskAssignment message. The worker was attempting to log
the task details without checking if the TaskId was empty, causing a
nil pointer dereference when accessing taskAssign.Params.VolumeId.

This fix adds a check for empty TaskId before processing the assignment,
preventing worker crashes and improving stability in production environments.

* Add EC integration test for admin-worker maintenance system

Adds comprehensive integration test that verifies the end-to-end flow
of erasure coding maintenance tasks:
- Admin server detects volumes needing EC encoding
- Workers register and receive task assignments
- EC encoding is executed and verified in master topology
- File read-back validation confirms data integrity

The test uses unique absolute working directories for each worker to
prevent ID conflicts and ensure stable worker registration. Includes
proper cleanup and process management for reliable test execution.

* Improve maintenance system stability and task deduplication

- Add cross-type task deduplication to prevent concurrent maintenance
  operations on the same volume (EC, balance, vacuum)
- Implement HasAnyTask check in ActiveTopology for better coordination
- Increase RequestTask timeout from 5s to 30s to prevent unnecessary
  worker reconnections
- Add TaskTypeNone sentinel for generic task checks
- Update all task detectors to use HasAnyTask for conflict prevention
- Improve config persistence and schema handling

* Add GitHub Actions workflow for EC integration tests

Adds CI workflow that runs EC integration tests on push and pull requests
to master branch. The workflow:
- Triggers on changes to admin, worker, or test files
- Builds the weed binary
- Runs the EC integration test suite
- Uploads test logs as artifacts on failure for debugging

This ensures the maintenance system remains stable and worker-admin
integration is validated in CI.

* go version 1.24

* address comments

* Update maintenance_integration.go

* support seconds

* ec prioritize over balancing in tests
This commit is contained in:
Chris Lu
2026-01-20 15:07:43 -08:00
committed by GitHub
parent f5bea40ab4
commit 13dcf445a4
23 changed files with 831 additions and 60 deletions

View File

@@ -493,3 +493,62 @@ func (s *MaintenanceIntegration) GetPendingOperations() *PendingOperations {
// GetActiveTopology returns the active topology tracker used for
// capacity-aware task coordination. Callers must tolerate a nil result:
// SyncTask itself guards against a nil activeTopology.
func (s *MaintenanceIntegration) GetActiveTopology() *topology.ActiveTopology {
return s.activeTopology
}
// SyncTask mirrors a maintenance task into the active topology so that
// capacity tracking stays consistent with the maintenance queue. Tasks with
// unknown types, or in a terminal state (completed/failed/cancelled), are
// silently skipped; so is the call entirely when no topology is attached.
func (s *MaintenanceIntegration) SyncTask(task *MaintenanceTask) {
	if s.activeTopology == nil {
		return
	}

	// Map the maintenance task type onto its topology counterpart.
	taskType, ok := s.revTaskTypeMap[task.Type]
	if !ok {
		return
	}

	// Only pending and in-flight tasks are mirrored into the topology.
	var status topology.TaskStatus
	switch task.Status {
	case TaskStatusPending:
		status = topology.TaskStatusPending
	case TaskStatusAssigned, TaskStatusInProgress:
		status = topology.TaskStatusInProgress
	default:
		return // Don't sync completed/failed/cancelled tasks
	}

	var (
		sources       []topology.TaskSource
		destinations  []topology.TaskDestination
		estimatedSize int64
	)
	if params := task.TypedParams; params != nil {
		// Collect the unified sources from TaskParams, accumulating the
		// total estimated size across all of them.
		for _, src := range params.Sources {
			sources = append(sources, topology.TaskSource{
				SourceServer: src.Node,
				SourceDisk:   src.DiskId,
			})
			estimatedSize += int64(src.EstimatedSize)
		}

		// Collect the unified targets.
		for _, target := range params.Targets {
			destinations = append(destinations, topology.TaskDestination{
				TargetServer: target.Node,
				TargetDisk:   target.DiskId,
			})
		}

		// Hooks for type-specific sync behavior; all currently no-ops.
		if vacuumParams := params.GetVacuumParams(); vacuumParams != nil {
			// TODO: Add vacuum-specific sync logic if necessary
		} else if ecParams := params.GetErasureCodingParams(); ecParams != nil {
			// TODO: Add EC-specific sync logic if necessary
		} else if balanceParams := params.GetBalanceParams(); balanceParams != nil {
			// TODO: Add balance-specific sync logic if necessary
		}
	}

	// Restore into topology
	s.activeTopology.RestoreMaintenanceTask(task.ID, task.VolumeID, topology.TaskType(string(taskType)), status, sources, destinations, estimatedSize)
}

View File

@@ -558,10 +558,29 @@ func (mm *MaintenanceManager) UpdateConfig(config *MaintenanceConfig) error {
mm.queue.policy = config.Policy
mm.scanner.policy = config.Policy
// Propagate global policy changes to individual task configuration files
if config.Policy != nil {
mm.saveTaskConfigsFromPolicy(config.Policy)
}
glog.V(1).Infof("Maintenance configuration updated")
return nil
}
// saveTaskConfigsFromPolicy fans the global maintenance policy out into the
// per-task-type configuration files, one per entry in policy.TaskPolicies.
// Saving is best-effort: a failure for one task type is logged and the
// remaining types are still written. No-op when persistence is unavailable
// or no policy was supplied.
func (mm *MaintenanceManager) saveTaskConfigsFromPolicy(policy *worker_pb.MaintenancePolicy) {
	persistence := mm.queue.persistence
	if persistence == nil || policy == nil {
		return
	}

	glog.V(1).Infof("Propagating maintenance policy changes to separate task configs")

	for taskType, taskPolicy := range policy.TaskPolicies {
		err := persistence.SaveTaskPolicy(taskType, taskPolicy)
		if err != nil {
			glog.Errorf("Failed to save task policy for %s: %v", taskType, err)
		}
	}
}
// CancelTask cancels a pending task
func (mm *MaintenanceManager) CancelTask(taskID string) error {
mm.queue.mutex.Lock()

View File

@@ -180,6 +180,9 @@ type TaskPersistence interface {
LoadAllTaskStates() ([]*MaintenanceTask, error)
DeleteTaskState(taskID string) error
CleanupCompletedTasks() error
// Policy persistence
SaveTaskPolicy(taskType string, policy *TaskPolicy) error
}
// Default configuration values