fix: resolve ServerAddress to NodeId in maintenance task sync (#8508)

* fix: maintenance task topology lookup, retry, and stale task cleanup

  1. Strip the gRPC port from ServerAddress in SyncTask using ToHttpAddress() so task sources and targets match topology disk keys (NodeId format).
  2. Skip the capacity check when the topology has no disks yet (startup race where tasks are loaded from persistence before the first topology update).
  3. Don't retry permanent errors such as "volume not found"; these will never succeed on retry.
  4. Cancel all pending tasks for each task type before re-detection, ensuring stale proposals from previous cycles are cleaned up.

  This prevents stale tasks from blocking new detection and from repeatedly failing.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* logs

  Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less lock scope

  Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-04 19:20:28 -08:00
parent 88e8342e44
commit c19f88eef1
4 changed files with 90 additions and 17 deletions
--- a/weed/admin/maintenance/maintenance_integration.go
+++ b/weed/admin/maintenance/maintenance_integration.go
@@ -5,6 +5,7 @@ import (

 	"github.com/seaweedfs/seaweedfs/weed/admin/topology"
 	"github.com/seaweedfs/seaweedfs/weed/glog"
+	"github.com/seaweedfs/seaweedfs/weed/pb"
 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
 	"github.com/seaweedfs/seaweedfs/weed/worker/tasks"
 	"github.com/seaweedfs/seaweedfs/weed/worker/types"
@@ -229,6 +230,12 @@ func (s *MaintenanceIntegration) ScanWithTaskDetectors(volumeMetrics []*types.Vo
 			continue
 		}

+		// Cancel stale pending tasks for this type before re-detection
+		maintenanceType := s.taskTypeMap[taskType]
+		if cancelled := s.maintenanceQueue.CancelPendingTasksByType(maintenanceType); cancelled > 0 {
+			glog.Infof("Cancelled %d stale pending %s tasks before re-detection", cancelled, taskType)
+		}
+
 		glog.V(2).Infof("Running detection for task type: %s", taskType)

 		results, err := detector.ScanForTasks(filteredMetrics, clusterInfo)
@@ -528,10 +535,15 @@ func (s *MaintenanceIntegration) SyncTask(task *MaintenanceTask) {
 		// Volume size is not currently used for Balance/Vacuum impact and is not stored in MaintenanceTask
 		sourceImpact, targetImpact := topology.CalculateTaskStorageImpact(topology.TaskType(string(taskType)), 0)

-		// Use unified sources and targets from TaskParams
+		// Use unified sources and targets from TaskParams.
+		// Task protos store ServerAddresses (with gRPC port, e.g., "host:port.grpcPort")
+		// but the topology indexes disks by NodeId (e.g., "host:port").
+		// Strip the gRPC port suffix via ToHttpAddress() to match the topology key.
 		for _, src := range task.TypedParams.Sources {
+			resolvedSrc := pb.ServerAddress(src.Node).ToHttpAddress()
+			glog.V(2).Infof("SyncTask %s: source proto Node=%q resolved to %q, diskId=%d", task.ID, src.Node, resolvedSrc, src.DiskId)
 			sources = append(sources, topology.TaskSource{
-				SourceServer:  src.Node,
+				SourceServer:  resolvedSrc,
 				SourceDisk:    src.DiskId,
 				StorageChange: sourceImpact,
 			})
@@ -539,8 +551,10 @@ func (s *MaintenanceIntegration) SyncTask(task *MaintenanceTask) {
 			estimatedSize += int64(src.EstimatedSize)
 		}
 		for _, target := range task.TypedParams.Targets {
+			resolvedTarget := pb.ServerAddress(target.Node).ToHttpAddress()
+			glog.V(2).Infof("SyncTask %s: target proto Node=%q resolved to %q, diskId=%d", task.ID, target.Node, resolvedTarget, target.DiskId)
 			destinations = append(destinations, topology.TaskDestination{
-				TargetServer:  target.Node,
+				TargetServer:  resolvedTarget,
 				TargetDisk:    target.DiskId,
 				StorageChange: targetImpact,
 			})