Fix maintenance worker panic and add EC integration tests (#8068)
* Fix nil pointer panic in maintenance worker when receiving empty task assignment When a worker requests a task and none are available, the admin server sends an empty TaskAssignment message. The worker was attempting to log the task details without checking if the TaskId was empty, causing a nil pointer dereference when accessing taskAssign.Params.VolumeId. This fix adds a check for empty TaskId before processing the assignment, preventing worker crashes and improving stability in production environments. * Add EC integration test for admin-worker maintenance system Adds comprehensive integration test that verifies the end-to-end flow of erasure coding maintenance tasks: - Admin server detects volumes needing EC encoding - Workers register and receive task assignments - EC encoding is executed and verified in master topology - File read-back validation confirms data integrity The test uses unique absolute working directories for each worker to prevent ID conflicts and ensure stable worker registration. Includes proper cleanup and process management for reliable test execution. * Improve maintenance system stability and task deduplication - Add cross-type task deduplication to prevent concurrent maintenance operations on the same volume (EC, balance, vacuum) - Implement HasAnyTask check in ActiveTopology for better coordination - Increase RequestTask timeout from 5s to 30s to prevent unnecessary worker reconnections - Add TaskTypeNone sentinel for generic task checks - Update all task detectors to use HasAnyTask for conflict prevention - Improve config persistence and schema handling * Add GitHub Actions workflow for EC integration tests Adds CI workflow that runs EC integration tests on push and pull requests to master branch. 
The workflow: - Triggers on changes to admin, worker, or test files - Builds the weed binary - Runs the EC integration test suite - Uploads test logs as artifacts on failure for debugging This ensures the maintenance system remains stable and worker-admin integration is validated in CI. * go version 1.24 * address comments * Update maintenance_integration.go * support seconds * ec prioritize over balancing in tests
This commit is contained in:
@@ -832,8 +832,8 @@ func (c *GrpcAdminClient) RequestTask(workerID string, capabilities []types.Task
|
||||
}
|
||||
|
||||
// Wait for task assignment
|
||||
glog.V(3).Infof("WAITING FOR RESPONSE: Worker %s waiting for task assignment response (5s timeout)", workerID)
|
||||
timeout := time.NewTimer(5 * time.Second)
|
||||
glog.V(3).Infof("WAITING FOR RESPONSE: Worker %s waiting for task assignment response (30s timeout)", workerID)
|
||||
timeout := time.NewTimer(30 * time.Second)
|
||||
defer timeout.Stop()
|
||||
|
||||
for {
|
||||
@@ -841,6 +841,12 @@ func (c *GrpcAdminClient) RequestTask(workerID string, capabilities []types.Task
|
||||
case response := <-c.incoming:
|
||||
glog.V(3).Infof("RESPONSE RECEIVED: Worker %s received response from admin server: %T", workerID, response.Message)
|
||||
if taskAssign := response.GetTaskAssignment(); taskAssign != nil {
|
||||
// Validate TaskId is not empty before processing
|
||||
if taskAssign.TaskId == "" {
|
||||
glog.Warningf("Worker %s received task assignment with empty TaskId, ignoring", workerID)
|
||||
continue
|
||||
}
|
||||
|
||||
glog.V(1).Infof("Worker %s received task assignment in response: %s (type: %s, volume: %d)",
|
||||
workerID, taskAssign.TaskId, taskAssign.TaskType, taskAssign.Params.VolumeId)
|
||||
|
||||
@@ -862,7 +868,7 @@ func (c *GrpcAdminClient) RequestTask(workerID string, capabilities []types.Task
|
||||
glog.V(3).Infof("NON-TASK RESPONSE: Worker %s received non-task response: %T", workerID, response.Message)
|
||||
}
|
||||
case <-timeout.C:
|
||||
glog.V(3).Infof("TASK REQUEST TIMEOUT: Worker %s - no task assignment received within 5 seconds", workerID)
|
||||
glog.V(3).Infof("TASK REQUEST TIMEOUT: Worker %s - no task assignment received within 30 seconds", workerID)
|
||||
return nil, nil // No task available
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,6 +100,12 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
|
||||
|
||||
// Plan destination if ActiveTopology is available
|
||||
if clusterInfo.ActiveTopology != nil {
|
||||
// Check if ANY task already exists in ActiveTopology for this volume
|
||||
if clusterInfo.ActiveTopology.HasAnyTask(selectedVolume.VolumeID) {
|
||||
glog.V(2).Infof("BALANCE: Skipping volume %d, task already exists in ActiveTopology", selectedVolume.VolumeID)
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
destinationPlan, err := planBalanceDestination(clusterInfo.ActiveTopology, selectedVolume)
|
||||
if err != nil {
|
||||
glog.Warningf("Failed to plan balance destination for volume %d: %v", selectedVolume.VolumeID, err)
|
||||
|
||||
@@ -86,7 +86,7 @@ func GetConfigSpec() base.ConfigSpec {
|
||||
JSONName: "quiet_for_seconds",
|
||||
Type: config.FieldTypeInterval,
|
||||
DefaultValue: 300,
|
||||
MinValue: 60,
|
||||
MinValue: 1,
|
||||
MaxValue: 3600,
|
||||
Required: true,
|
||||
DisplayName: "Quiet Period",
|
||||
@@ -102,7 +102,7 @@ func GetConfigSpec() base.ConfigSpec {
|
||||
JSONName: "fullness_ratio",
|
||||
Type: config.FieldTypeFloat,
|
||||
DefaultValue: 0.8,
|
||||
MinValue: 0.1,
|
||||
MinValue: 0.0001,
|
||||
MaxValue: 1.0,
|
||||
Required: true,
|
||||
DisplayName: "Fullness Ratio",
|
||||
|
||||
@@ -34,7 +34,22 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
|
||||
skippedQuietTime := 0
|
||||
skippedFullness := 0
|
||||
|
||||
// Group metrics by VolumeID to handle replicas and select canonical server
|
||||
volumeGroups := make(map[uint32][]*types.VolumeHealthMetrics)
|
||||
for _, metric := range metrics {
|
||||
volumeGroups[metric.VolumeID] = append(volumeGroups[metric.VolumeID], metric)
|
||||
}
|
||||
|
||||
// Iterate over groups to check criteria and create tasks
|
||||
for _, groupMetrics := range volumeGroups {
|
||||
// Find canonical metric (lowest Server ID) to ensure consistent task deduplication
|
||||
metric := groupMetrics[0]
|
||||
for _, m := range groupMetrics {
|
||||
if m.Server < metric.Server {
|
||||
metric = m
|
||||
}
|
||||
}
|
||||
|
||||
// Skip if already EC volume
|
||||
if metric.IsECVolume {
|
||||
skippedAlreadyEC++
|
||||
@@ -83,6 +98,12 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
|
||||
|
||||
// Plan EC destinations if ActiveTopology is available
|
||||
if clusterInfo.ActiveTopology != nil {
|
||||
// Check if ANY task already exists in ActiveTopology for this volume
|
||||
if clusterInfo.ActiveTopology.HasAnyTask(metric.VolumeID) {
|
||||
glog.V(2).Infof("EC Detection: Skipping volume %d, task already exists in ActiveTopology", metric.VolumeID)
|
||||
continue
|
||||
}
|
||||
|
||||
glog.Infof("EC Detection: ActiveTopology available, planning destinations for volume %d", metric.VolumeID)
|
||||
multiPlan, err := planECDestinations(clusterInfo.ActiveTopology, metric, ecConfig)
|
||||
if err != nil {
|
||||
|
||||
@@ -47,6 +47,14 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
|
||||
ScheduleAt: time.Now(),
|
||||
}
|
||||
|
||||
// Check if ANY task already exists in ActiveTopology for this volume
|
||||
if clusterInfo != nil && clusterInfo.ActiveTopology != nil {
|
||||
if clusterInfo.ActiveTopology.HasAnyTask(metric.VolumeID) {
|
||||
glog.V(2).Infof("VACUUM: Skipping volume %d, task already exists in ActiveTopology", metric.VolumeID)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Create typed parameters for vacuum task
|
||||
result.TypedParams = createVacuumTaskParams(result, metric, vacuumConfig, clusterInfo)
|
||||
if result.TypedParams != nil {
|
||||
|
||||
@@ -12,6 +12,11 @@ func secondsToIntervalValueUnit(totalSeconds int) (int, string) {
|
||||
return 0, "minute"
|
||||
}
|
||||
|
||||
// Preserve seconds when the total is under a minute or not evenly divisible by 60
|
||||
if totalSeconds < 60 || totalSeconds%60 != 0 {
|
||||
return totalSeconds, "second"
|
||||
}
|
||||
|
||||
// Check if it's evenly divisible by days
|
||||
if totalSeconds%(24*3600) == 0 {
|
||||
return totalSeconds / (24 * 3600), "day"
|
||||
@@ -35,6 +40,8 @@ func IntervalValueUnitToSeconds(value int, unit string) int {
|
||||
return value * 3600
|
||||
case "minute":
|
||||
return value * 60
|
||||
case "second":
|
||||
return value
|
||||
default:
|
||||
return value * 60 // Default to minutes
|
||||
}
|
||||
|
||||
@@ -947,6 +947,10 @@ func (w *Worker) processAdminMessage(message *worker_pb.AdminMessage) {
|
||||
w.handleTaskLogRequest(msg.TaskLogRequest)
|
||||
case *worker_pb.AdminMessage_TaskAssignment:
|
||||
taskAssign := msg.TaskAssignment
|
||||
if taskAssign.TaskId == "" {
|
||||
glog.V(1).Infof("Worker %s received empty task assignment, going to sleep", w.id)
|
||||
return
|
||||
}
|
||||
glog.V(1).Infof("Worker %s received direct task assignment %s (type: %s, volume: %d)",
|
||||
w.id, taskAssign.TaskId, taskAssign.TaskType, taskAssign.Params.VolumeId)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user