fix: resolve ServerAddress to NodeId in maintenance task sync (#8508)

* fix: maintenance task topology lookup, retry, and stale task cleanup

1. Strip gRPC port from ServerAddress in SyncTask using ToHttpAddress()
   so task targets match topology disk keys (NodeId format).

2. Skip capacity check when topology has no disks yet (startup race
   where tasks are loaded from persistence before first topology update).

3. Don't retry permanent errors like "volume not found" - these will
   never succeed on retry.

4. Cancel all pending tasks for each task type before re-detection,
   ensuring stale proposals from previous cycles are cleaned up.
   This prevents stale tasks from blocking new detection and from
   repeatedly failing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* add logs listing topology disk keys to help debug destination lookup failures

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* reduce lock scope

Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Chris Lu
2026-03-04 19:20:28 -08:00
committed by GitHub
parent 88e8342e44
commit c19f88eef1
4 changed files with 90 additions and 17 deletions

View File

@@ -17,20 +17,30 @@ func (at *ActiveTopology) AssignTask(taskID string) error {
return fmt.Errorf("pending task %s not found", taskID)
}
// Check if all destination disks have sufficient capacity to reserve
for _, dest := range task.Destinations {
targetKey := fmt.Sprintf("%s:%d", dest.TargetServer, dest.TargetDisk)
if targetDisk, exists := at.disks[targetKey]; exists {
availableCapacity := at.getEffectiveAvailableCapacityUnsafe(targetDisk)
// Skip capacity check if topology hasn't been populated yet
if len(at.disks) == 0 {
glog.Warningf("AssignTask %s: topology has no disks yet, skipping capacity check", taskID)
} else {
// Check if all destination disks have sufficient capacity to reserve
for _, dest := range task.Destinations {
targetKey := fmt.Sprintf("%s:%d", dest.TargetServer, dest.TargetDisk)
if targetDisk, exists := at.disks[targetKey]; exists {
availableCapacity := at.getEffectiveAvailableCapacityUnsafe(targetDisk)
// Check if we have enough total capacity using the improved unified comparison
if !availableCapacity.CanAccommodate(dest.StorageChange) {
return fmt.Errorf("insufficient capacity on target disk %s:%d. Available: %+v, Required: %+v",
dest.TargetServer, dest.TargetDisk, availableCapacity, dest.StorageChange)
// Check if we have enough total capacity using the improved unified comparison
if !availableCapacity.CanAccommodate(dest.StorageChange) {
return fmt.Errorf("insufficient capacity on target disk %s:%d. Available: %+v, Required: %+v",
dest.TargetServer, dest.TargetDisk, availableCapacity, dest.StorageChange)
}
} else if dest.TargetServer != "" {
// Fail fast if destination disk is not found in topology
var existingKeys []string
for k := range at.disks {
existingKeys = append(existingKeys, k)
}
glog.Warningf("destination disk %s not found in topology. Existing disk keys: %v", targetKey, existingKeys)
return fmt.Errorf("destination disk %s not found in topology", targetKey)
}
} else if dest.TargetServer != "" {
// Fail fast if destination disk is not found in topology
return fmt.Errorf("destination disk %s not found in topology", targetKey)
}
}

View File

@@ -87,6 +87,8 @@ func (at *ActiveTopology) UpdateTopology(topologyInfo *master_pb.TopologyInfo) e
}
diskKey := fmt.Sprintf("%s:%d", nodeInfo.Id, diskInfo.DiskId)
glog.V(2).Infof("UpdateTopology: adding disk key=%q nodeId=%q diskId=%d diskType=%q address=%q grpcPort=%d volumes=%d maxVolumes=%d",
diskKey, nodeInfo.Id, diskInfo.DiskId, diskType, nodeInfo.Address, nodeInfo.GrpcPort, diskInfo.VolumeCount, diskInfo.MaxVolumeCount)
node.disks[diskInfo.DiskId] = disk
at.disks[diskKey] = disk
}