fix: resolve ServerAddress to NodeId in maintenance task sync (#8508)
* fix: maintenance task topology lookup, retry, and stale task cleanup

  1. Strip the gRPC port from ServerAddress in SyncTask using ToHttpAddress() so task targets match topology disk keys (NodeId format).
  2. Skip the capacity check when topology has no disks yet (startup race where tasks are loaded from persistence before the first topology update).
  3. Don't retry permanent errors like "volume not found" - these will never succeed on retry.
  4. Cancel all pending tasks for each task type before re-detection, ensuring stale proposals from previous cycles are cleaned up. This prevents stale tasks from blocking new detection and from repeatedly failing.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* logs

  Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

* less lock scope

  Co-Authored-By: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
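Item 1 above, the headline fix of resolving ServerAddress to NodeId, does not appear in the hunks below, which cover item 2 and new diagnostics. A minimal sketch of the normalization, assuming SeaweedFS's pb.ServerAddress type and the ToHttpAddress() helper named in the message; the wrapper function itself is illustrative, not the actual SyncTask code:

package maintenance

import (
	"fmt"

	"github.com/seaweedfs/seaweedfs/weed/pb"
)

// diskKeyForTarget is a hypothetical helper showing the idea behind fix 1:
// resolve the task's target server to its HTTP (NodeId) form before building
// the disk key, so lookups match the keys UpdateTopology writes from
// nodeInfo.Id.
func diskKeyForTarget(targetServer string, targetDisk uint32) string {
	// ToHttpAddress() drops the gRPC port component, leaving the
	// "host:port" form that topology uses as the NodeId.
	nodeId := pb.ServerAddress(targetServer).ToHttpAddress()
	return fmt.Sprintf("%s:%d", nodeId, targetDisk)
}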
@@ -17,20 +17,30 @@ func (at *ActiveTopology) AssignTask(taskID string) error {
 		return fmt.Errorf("pending task %s not found", taskID)
 	}
 
-	// Check if all destination disks have sufficient capacity to reserve
-	for _, dest := range task.Destinations {
-		targetKey := fmt.Sprintf("%s:%d", dest.TargetServer, dest.TargetDisk)
-		if targetDisk, exists := at.disks[targetKey]; exists {
-			availableCapacity := at.getEffectiveAvailableCapacityUnsafe(targetDisk)
+	// Skip capacity check if topology hasn't been populated yet
+	if len(at.disks) == 0 {
+		glog.Warningf("AssignTask %s: topology has no disks yet, skipping capacity check", taskID)
+	} else {
+		// Check if all destination disks have sufficient capacity to reserve
+		for _, dest := range task.Destinations {
+			targetKey := fmt.Sprintf("%s:%d", dest.TargetServer, dest.TargetDisk)
+			if targetDisk, exists := at.disks[targetKey]; exists {
+				availableCapacity := at.getEffectiveAvailableCapacityUnsafe(targetDisk)
 
-			// Check if we have enough total capacity using the improved unified comparison
-			if !availableCapacity.CanAccommodate(dest.StorageChange) {
-				return fmt.Errorf("insufficient capacity on target disk %s:%d. Available: %+v, Required: %+v",
-					dest.TargetServer, dest.TargetDisk, availableCapacity, dest.StorageChange)
-			}
-		} else if dest.TargetServer != "" {
-			// Fail fast if destination disk is not found in topology
-			return fmt.Errorf("destination disk %s not found in topology", targetKey)
+				// Check if we have enough total capacity using the improved unified comparison
+				if !availableCapacity.CanAccommodate(dest.StorageChange) {
+					return fmt.Errorf("insufficient capacity on target disk %s:%d. Available: %+v, Required: %+v",
+						dest.TargetServer, dest.TargetDisk, availableCapacity, dest.StorageChange)
+				}
+			} else if dest.TargetServer != "" {
+				// Fail fast if destination disk is not found in topology
+				var existingKeys []string
+				for k := range at.disks {
+					existingKeys = append(existingKeys, k)
+				}
+				glog.Warningf("destination disk %s not found in topology. Existing disk keys: %v", targetKey, existingKeys)
+				return fmt.Errorf("destination disk %s not found in topology", targetKey)
+			}
 		}
 	}
 
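The hunk above implements item 2, the startup-race guard. Item 3, skipping retries for permanent errors, is not in the hunks shown; a minimal sketch, assuming retry decisions key off the error text (only "volume not found" comes from the message above; the helper name and structure are hypothetical):

package maintenance

import "strings"

// isPermanentTaskError is a hypothetical classifier: a permanent error will
// never succeed on retry, so the task should fail immediately instead of
// being re-queued.
func isPermanentTaskError(err error) bool {
	if err == nil {
		return false
	}
	// "volume not found" is the example given in the commit message; any
	// other markers would be added here.
	for _, marker := range []string{"volume not found"} {
		if strings.Contains(err.Error(), marker) {
			return true
		}
	}
	return false
}

A retry loop would consult this before re-queueing and mark the task failed on a permanent error rather than retrying it.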
@@ -87,6 +87,8 @@ func (at *ActiveTopology) UpdateTopology(topologyInfo *master_pb.TopologyInfo) e
 			}
 
 			diskKey := fmt.Sprintf("%s:%d", nodeInfo.Id, diskInfo.DiskId)
+			glog.V(2).Infof("UpdateTopology: adding disk key=%q nodeId=%q diskId=%d diskType=%q address=%q grpcPort=%d volumes=%d maxVolumes=%d",
+				diskKey, nodeInfo.Id, diskInfo.DiskId, diskType, nodeInfo.Address, nodeInfo.GrpcPort, diskInfo.VolumeCount, diskInfo.MaxVolumeCount)
 			node.disks[diskInfo.DiskId] = disk
 			at.disks[diskKey] = disk
 		}
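The second hunk only adds a V(2) diagnostic showing the disk-key format. Item 4, clearing stale proposals before each detection cycle, is likewise outside the diff; a sketch with assumed queue and task types (none of these names are from the source):

package maintenance

// Hypothetical minimal types for illustration; the real queue and task
// structures differ.
type Task struct {
	ID   string
	Type string
}

type pendingQueue interface {
	GetPendingTasks(taskType string) []*Task
	CancelTask(taskID string)
}

// cancelStaleBeforeDetection sketches item 4: before a task type re-runs
// detection, cancel all of its still-pending proposals so leftovers from a
// previous cycle (built against a possibly stale topology) cannot block new
// detection or fail repeatedly.
func cancelStaleBeforeDetection(q pendingQueue, taskType string) {
	for _, t := range q.GetPendingTasks(taskType) {
		q.CancelTask(t.ID)
	}
}

Cancelling first makes each detection cycle idempotent: anything still needed is simply re-proposed against the current topology.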