* Fix issue #7880: Tasks use Volume IDs instead of ip:port

  When volume servers are registered with custom IDs, tasks were attempting to connect using the ID instead of the actual ip:port address, causing connection failures. Modified task detection logic in balance, erasure coding, and vacuum tasks to resolve volume server IDs to their actual ip:port addresses using ActiveTopology information.

* Use server addresses directly instead of translating from IDs

  Modified VolumeHealthMetrics to include ServerAddress field populated directly from topology DataNodeInfo.Address. Updated task detection logic to use addresses directly without runtime lookups.

  Changes:
  - Added ServerAddress field to VolumeHealthMetrics
  - Updated maintenance scanner to populate ServerAddress
  - Modified task detection to use ServerAddress for Node fields
  - Updated DestinationPlan to include TargetAddress
  - Removed runtime address lookups in favor of direct address usage

* Address PR comments: add ServerAddress field, improve error handling

  - Add missing ServerAddress field to VolumeHealthMetrics struct
  - Add warning in vacuum detection when server not found in topology
  - Improve error handling in erasure coding to abort task if sources missing
  - Make vacuum task stricter by skipping if server not found in topology

* Refactor: Extract common address resolution logic into shared utility

  - Created weed/worker/tasks/util/address.go with ResolveServerAddress function
  - Updated balance, erasure_coding, and vacuum detection to use the shared utility
  - Removed code duplication and improved maintainability
  - Consistent error handling across all task types

* Fix critical issues in task address resolution

  - Vacuum: Require topology availability and fail if server not found (no fallback to ID)
  - Ensure all task types consistently fail early when topology is incomplete
  - Prevent creation of tasks that would fail due to missing server addresses

* Address additional PR feedback

  - Add validation for empty addresses in ResolveServerAddress
  - Remove redundant serverAddress variable in vacuum detection
  - Improve robustness of address resolution

* Improve error logging in vacuum detection

  - Include actual error details in log message for better diagnostics
  - Make error messages consistent with other task types
234 lines
8.4 KiB
Go
234 lines
8.4 KiB
Go
package maintenance
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/worker/types"
|
|
)
|
|
|
|
// NewMaintenanceScanner creates a new maintenance scanner
|
|
func NewMaintenanceScanner(adminClient AdminClient, policy *MaintenancePolicy, queue *MaintenanceQueue) *MaintenanceScanner {
|
|
scanner := &MaintenanceScanner{
|
|
adminClient: adminClient,
|
|
policy: policy,
|
|
queue: queue,
|
|
lastScan: make(map[MaintenanceTaskType]time.Time),
|
|
}
|
|
|
|
// Initialize integration
|
|
scanner.integration = NewMaintenanceIntegration(queue, policy)
|
|
|
|
// Set up bidirectional relationship
|
|
queue.SetIntegration(scanner.integration)
|
|
|
|
glog.V(1).Infof("Initialized maintenance scanner with task system")
|
|
|
|
return scanner
|
|
}
|
|
|
|
// ScanForMaintenanceTasks analyzes the cluster and generates maintenance tasks
|
|
func (ms *MaintenanceScanner) ScanForMaintenanceTasks() ([]*TaskDetectionResult, error) {
|
|
// Get volume health metrics
|
|
volumeMetrics, err := ms.getVolumeHealthMetrics()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get volume health metrics: %w", err)
|
|
}
|
|
|
|
// Use task system for all task types
|
|
if ms.integration != nil {
|
|
// Convert metrics to task system format
|
|
taskMetrics := ms.convertToTaskMetrics(volumeMetrics)
|
|
|
|
// Update topology information for complete cluster view (including empty servers)
|
|
// This must happen before task detection to ensure EC placement can consider all servers
|
|
if ms.lastTopologyInfo != nil {
|
|
if err := ms.integration.UpdateTopologyInfo(ms.lastTopologyInfo); err != nil {
|
|
glog.Errorf("Failed to update topology info for empty servers: %v", err)
|
|
// Don't fail the scan - continue with just volume-bearing servers
|
|
} else {
|
|
glog.V(1).Infof("Updated topology info for complete cluster view including empty servers")
|
|
}
|
|
}
|
|
|
|
// Use task detection system with complete cluster information
|
|
results, err := ms.integration.ScanWithTaskDetectors(taskMetrics)
|
|
if err != nil {
|
|
glog.Errorf("Task scanning failed: %v", err)
|
|
return nil, err
|
|
}
|
|
|
|
glog.V(1).Infof("Maintenance scan completed: found %d tasks", len(results))
|
|
return results, nil
|
|
}
|
|
|
|
// No integration available
|
|
glog.Warningf("No integration available, no tasks will be scheduled")
|
|
return []*TaskDetectionResult{}, nil
|
|
}
|
|
|
|
// getVolumeHealthMetrics collects health information for all volumes
|
|
func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics, error) {
|
|
var metrics []*VolumeHealthMetrics
|
|
|
|
glog.V(1).Infof("Collecting volume health metrics from master")
|
|
err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error {
|
|
|
|
resp, err := client.VolumeList(context.Background(), &master_pb.VolumeListRequest{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if resp.TopologyInfo == nil {
|
|
glog.Warningf("No topology info received from master")
|
|
return nil
|
|
}
|
|
|
|
volumeSizeLimitBytes := uint64(resp.VolumeSizeLimitMb) * 1024 * 1024 // Convert MB to bytes
|
|
|
|
// Track all nodes discovered in topology
|
|
var allNodesInTopology []string
|
|
var nodesWithVolumes []string
|
|
var nodesWithoutVolumes []string
|
|
|
|
for _, dc := range resp.TopologyInfo.DataCenterInfos {
|
|
glog.V(2).Infof("Processing datacenter: %s", dc.Id)
|
|
for _, rack := range dc.RackInfos {
|
|
glog.V(2).Infof("Processing rack: %s in datacenter: %s", rack.Id, dc.Id)
|
|
for _, node := range rack.DataNodeInfos {
|
|
allNodesInTopology = append(allNodesInTopology, node.Id)
|
|
glog.V(2).Infof("Found volume server in topology: %s (disks: %d)", node.Id, len(node.DiskInfos))
|
|
|
|
hasVolumes := false
|
|
// Process each disk on this node
|
|
for diskType, diskInfo := range node.DiskInfos {
|
|
if len(diskInfo.VolumeInfos) > 0 {
|
|
hasVolumes = true
|
|
glog.V(2).Infof("Volume server %s disk %s has %d volumes", node.Id, diskType, len(diskInfo.VolumeInfos))
|
|
}
|
|
|
|
// Process volumes on this specific disk
|
|
for _, volInfo := range diskInfo.VolumeInfos {
|
|
metric := &VolumeHealthMetrics{
|
|
VolumeID: volInfo.Id,
|
|
Server: node.Id,
|
|
ServerAddress: node.Address,
|
|
DiskType: diskType, // Track which disk this volume is on
|
|
DiskId: volInfo.DiskId, // Use disk ID from volume info
|
|
DataCenter: dc.Id, // Data center from current loop
|
|
Rack: rack.Id, // Rack from current loop
|
|
Collection: volInfo.Collection,
|
|
Size: volInfo.Size,
|
|
DeletedBytes: volInfo.DeletedByteCount,
|
|
LastModified: time.Unix(int64(volInfo.ModifiedAtSecond), 0),
|
|
IsReadOnly: volInfo.ReadOnly,
|
|
IsECVolume: false, // Will be determined from volume structure
|
|
ReplicaCount: 1, // Will be counted
|
|
ExpectedReplicas: int(volInfo.ReplicaPlacement),
|
|
}
|
|
|
|
// Calculate derived metrics
|
|
if metric.Size > 0 {
|
|
metric.GarbageRatio = float64(metric.DeletedBytes) / float64(metric.Size)
|
|
// Calculate fullness ratio using actual volume size limit from master
|
|
metric.FullnessRatio = float64(metric.Size) / float64(volumeSizeLimitBytes)
|
|
}
|
|
metric.Age = time.Since(metric.LastModified)
|
|
|
|
glog.V(3).Infof("Volume %d on %s:%s (ID %d): size=%d, limit=%d, fullness=%.2f",
|
|
metric.VolumeID, metric.Server, metric.DiskType, metric.DiskId, metric.Size, volumeSizeLimitBytes, metric.FullnessRatio)
|
|
|
|
metrics = append(metrics, metric)
|
|
}
|
|
}
|
|
|
|
if hasVolumes {
|
|
nodesWithVolumes = append(nodesWithVolumes, node.Id)
|
|
} else {
|
|
nodesWithoutVolumes = append(nodesWithoutVolumes, node.Id)
|
|
glog.V(1).Infof("Volume server %s found in topology but has no volumes", node.Id)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
glog.Infof("Topology discovery complete:")
|
|
glog.Infof(" - Total volume servers in topology: %d (%v)", len(allNodesInTopology), allNodesInTopology)
|
|
glog.Infof(" - Volume servers with volumes: %d (%v)", len(nodesWithVolumes), nodesWithVolumes)
|
|
glog.Infof(" - Volume servers without volumes: %d (%v)", len(nodesWithoutVolumes), nodesWithoutVolumes)
|
|
|
|
// Store topology info for volume shard tracker
|
|
ms.lastTopologyInfo = resp.TopologyInfo
|
|
|
|
return nil
|
|
})
|
|
|
|
if err != nil {
|
|
glog.Errorf("Failed to get volume health metrics: %v", err)
|
|
return nil, err
|
|
}
|
|
|
|
glog.V(1).Infof("Successfully collected metrics for %d actual volumes with disk ID information", len(metrics))
|
|
|
|
// Count actual replicas and identify EC volumes
|
|
ms.enrichVolumeMetrics(metrics)
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
// enrichVolumeMetrics adds additional information like replica counts
|
|
func (ms *MaintenanceScanner) enrichVolumeMetrics(metrics []*VolumeHealthMetrics) {
|
|
// Group volumes by ID to count replicas
|
|
volumeGroups := make(map[uint32][]*VolumeHealthMetrics)
|
|
for _, metric := range metrics {
|
|
volumeGroups[metric.VolumeID] = append(volumeGroups[metric.VolumeID], metric)
|
|
}
|
|
|
|
// Update replica counts for actual volumes
|
|
for volumeID, replicas := range volumeGroups {
|
|
replicaCount := len(replicas)
|
|
for _, replica := range replicas {
|
|
replica.ReplicaCount = replicaCount
|
|
}
|
|
glog.V(3).Infof("Volume %d has %d replicas", volumeID, replicaCount)
|
|
}
|
|
|
|
// TODO: Identify EC volumes by checking volume structure
|
|
// This would require querying volume servers for EC shard information
|
|
}
|
|
|
|
// convertToTaskMetrics converts existing volume metrics to task system format
|
|
func (ms *MaintenanceScanner) convertToTaskMetrics(metrics []*VolumeHealthMetrics) []*types.VolumeHealthMetrics {
|
|
var simplified []*types.VolumeHealthMetrics
|
|
|
|
for _, metric := range metrics {
|
|
simplified = append(simplified, &types.VolumeHealthMetrics{
|
|
VolumeID: metric.VolumeID,
|
|
Server: metric.Server,
|
|
ServerAddress: metric.ServerAddress,
|
|
DiskType: metric.DiskType,
|
|
DiskId: metric.DiskId,
|
|
DataCenter: metric.DataCenter,
|
|
Rack: metric.Rack,
|
|
Collection: metric.Collection,
|
|
Size: metric.Size,
|
|
DeletedBytes: metric.DeletedBytes,
|
|
GarbageRatio: metric.GarbageRatio,
|
|
LastModified: metric.LastModified,
|
|
Age: metric.Age,
|
|
ReplicaCount: metric.ReplicaCount,
|
|
ExpectedReplicas: metric.ExpectedReplicas,
|
|
IsReadOnly: metric.IsReadOnly,
|
|
HasRemoteCopy: metric.HasRemoteCopy,
|
|
IsECVolume: metric.IsECVolume,
|
|
FullnessRatio: metric.FullnessRatio,
|
|
})
|
|
}
|
|
|
|
glog.V(2).Infof("Converted %d volume metrics with disk ID information for task detection", len(simplified))
|
|
return simplified
|
|
}
|