Fix issue #7880: Tasks use volume server IDs instead of ip:port (#7881)

* Fix issue #7880: Tasks use volume server IDs instead of ip:port

When volume servers were registered with custom IDs, tasks attempted
to connect using the ID instead of the actual ip:port address, causing
connection failures.

Modified task detection logic in balance, erasure coding, and vacuum tasks
to resolve volume server IDs to their actual ip:port addresses using
ActiveTopology information.

* Use server addresses directly instead of translating from IDs

Modified VolumeHealthMetrics to include ServerAddress field populated
directly from topology DataNodeInfo.Address. Updated task detection
logic to use addresses directly without runtime lookups.

Changes:
- Added ServerAddress field to VolumeHealthMetrics
- Updated maintenance scanner to populate ServerAddress
- Modified task detection to use ServerAddress for Node fields
- Updated DestinationPlan to include TargetAddress
- Removed runtime address lookups in favor of direct address usage

* Address PR comments: add ServerAddress field, improve error handling

- Add missing ServerAddress field to VolumeHealthMetrics struct
- Add warning in vacuum detection when server not found in topology
- Improve error handling in erasure coding to abort the task if sources are missing
- Make vacuum task stricter by skipping if server not found in topology

* Refactor: Extract common address resolution logic into shared utility

- Created weed/worker/tasks/util/address.go with ResolveServerAddress function
- Updated balance, erasure_coding, and vacuum detection to use the shared utility
- Removed code duplication and improved maintainability
- Consistent error handling across all task types

* Fix critical issues in task address resolution

- Vacuum: Require topology availability and fail if server not found (no fallback to ID)
- Ensure all task types consistently fail early when topology is incomplete
- Prevent creation of tasks that would fail due to missing server addresses

* Address additional PR feedback

- Add validation for empty addresses in ResolveServerAddress
- Remove redundant serverAddress variable in vacuum detection
- Improve robustness of address resolution

* Improve error logging in vacuum detection

- Include actual error details in log message for better diagnostics
- Make error messages consistent with other task types
This commit is contained in:
Chris Lu
2025-12-25 16:14:05 -08:00
committed by GitHub
parent 225e3d0302
commit c260e6a22e
9 changed files with 84 additions and 14 deletions

View File

@@ -115,6 +115,7 @@ func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics,
metric := &VolumeHealthMetrics{
VolumeID: volInfo.Id,
Server: node.Id,
ServerAddress: node.Address,
DiskType: diskType, // Track which disk this volume is on
DiskId: volInfo.DiskId, // Use disk ID from volume info
DataCenter: dc.Id, // Data center from current loop
@@ -207,6 +208,7 @@ func (ms *MaintenanceScanner) convertToTaskMetrics(metrics []*VolumeHealthMetric
simplified = append(simplified, &types.VolumeHealthMetrics{
VolumeID: metric.VolumeID,
Server: metric.Server,
ServerAddress: metric.ServerAddress,
DiskType: metric.DiskType,
DiskId: metric.DiskId,
DataCenter: metric.DataCenter,

View File

@@ -362,6 +362,7 @@ type TaskDetectionResult struct {
type VolumeHealthMetrics struct {
VolumeID uint32 `json:"volume_id"`
Server string `json:"server"`
ServerAddress string `json:"server_address"`
DiskType string `json:"disk_type"` // Disk type (e.g., "hdd", "ssd") or disk path (e.g., "/data1")
DiskId uint32 `json:"disk_id"` // ID of the disk in Store.Locations array
DataCenter string `json:"data_center"` // Data center of the server

View File

@@ -110,10 +110,10 @@ func TestPendingOperations_VolumeFiltering(t *testing.T) {
// Create volume metrics
metrics := []*types.VolumeHealthMetrics{
{VolumeID: 100, Server: "node1"},
{VolumeID: 101, Server: "node2"},
{VolumeID: 102, Server: "node3"},
{VolumeID: 103, Server: "node1"},
{VolumeID: 100, Server: "node1", ServerAddress: "192.168.1.1:8080"},
{VolumeID: 101, Server: "node2", ServerAddress: "192.168.1.2:8080"},
{VolumeID: 102, Server: "node3", ServerAddress: "192.168.1.3:8080"},
{VolumeID: 103, Server: "node1", ServerAddress: "192.168.1.1:8080"},
}
// Add pending operations on volumes 101 and 103

View File

@@ -97,6 +97,7 @@ type ActiveTopology struct {
// DestinationPlan represents a planned destination for a volume/shard operation
type DestinationPlan struct {
TargetNode string `json:"target_node"`
TargetAddress string `json:"target_address"`
TargetDisk uint32 `json:"target_disk"`
TargetRack string `json:"target_rack"`
TargetDC string `json:"target_dc"`