Admin UI: Fetch task logs (#7114)

* show task details

* loading tasks

* task UI works

* generic rendering

* rendering the export link

* removing placementConflicts from task parameters

* remove TaskSourceLocation

* remove "Server ID" column

* rendering balance task source

* sources and targets

* fix ec task generation

* move info

* render timeline

* simplified worker id

* simplify

* read task logs from worker

* isValidTaskID

* address comments

* Update weed/worker/tasks/balance/execution.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update weed/worker/tasks/erasure_coding/ec_task.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update weed/worker/tasks/task_log_handler.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix shard ids

* plan distributing shard id

* rendering planned shards in task details

* remove Conflicts

* worker logs correctly

* pass in dc and rack

* task logging

* Update weed/admin/maintenance/maintenance_queue.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* display log details

* logs have fields now

* sort field keys

* fix link

* fix collection filtering

* avoid hard coded ec shard counts

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Author: Chris Lu
Date: 2025-08-09 21:47:29 -07:00
Committed by: GitHub
Parent commit: 3ac2a2e22d
Commit: 25bbf4c3d4
52 changed files with 7307 additions and 2004 deletions

weed/admin/maintenance/maintenance_queue.go

@@ -7,7 +7,6 @@ import (
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
)
// NewMaintenanceQueue creates a new maintenance queue
@@ -27,6 +26,102 @@ func (mq *MaintenanceQueue) SetIntegration(integration *MaintenanceIntegration)
glog.V(1).Infof("Maintenance queue configured with integration")
}
// SetPersistence sets the task persistence interface
func (mq *MaintenanceQueue) SetPersistence(persistence TaskPersistence) {
mq.persistence = persistence
glog.V(1).Infof("Maintenance queue configured with task persistence")
}
// LoadTasksFromPersistence loads tasks from persistent storage on startup
func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
if mq.persistence == nil {
glog.V(1).Infof("No task persistence configured, skipping task loading")
return nil
}
mq.mutex.Lock()
defer mq.mutex.Unlock()
glog.Infof("Loading tasks from persistence...")
tasks, err := mq.persistence.LoadAllTaskStates()
if err != nil {
return fmt.Errorf("failed to load task states: %w", err)
}
glog.Infof("DEBUG LoadTasksFromPersistence: Found %d tasks in persistence", len(tasks))
// Reset task maps
mq.tasks = make(map[string]*MaintenanceTask)
mq.pendingTasks = make([]*MaintenanceTask, 0)
// Load tasks by status
for _, task := range tasks {
glog.Infof("DEBUG LoadTasksFromPersistence: Loading task %s (type: %s, status: %s, scheduled: %v)", task.ID, task.Type, task.Status, task.ScheduledAt)
mq.tasks[task.ID] = task
switch task.Status {
case TaskStatusPending:
glog.Infof("DEBUG LoadTasksFromPersistence: Adding task %s to pending queue", task.ID)
mq.pendingTasks = append(mq.pendingTasks, task)
case TaskStatusAssigned, TaskStatusInProgress:
// For assigned/in-progress tasks, we need to check if the worker is still available
// If not, we should fail them and make them eligible for retry
if task.WorkerID != "" {
if _, exists := mq.workers[task.WorkerID]; !exists {
glog.Warningf("Task %s was assigned to unavailable worker %s, marking as failed", task.ID, task.WorkerID)
task.Status = TaskStatusFailed
task.Error = "Worker unavailable after restart"
completedTime := time.Now()
task.CompletedAt = &completedTime
// Check if it should be retried
if task.RetryCount < task.MaxRetries {
task.RetryCount++
task.Status = TaskStatusPending
task.WorkerID = ""
task.StartedAt = nil
task.CompletedAt = nil
task.Error = ""
task.ScheduledAt = time.Now().Add(1 * time.Minute) // Retry after restart delay
glog.Infof("DEBUG LoadTasksFromPersistence: Retrying task %s, adding to pending queue", task.ID)
mq.pendingTasks = append(mq.pendingTasks, task)
}
}
}
}
}
// Sort pending tasks by priority and schedule time
sort.Slice(mq.pendingTasks, func(i, j int) bool {
if mq.pendingTasks[i].Priority != mq.pendingTasks[j].Priority {
return mq.pendingTasks[i].Priority > mq.pendingTasks[j].Priority
}
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
})
glog.Infof("Loaded %d tasks from persistence (%d pending)", len(tasks), len(mq.pendingTasks))
return nil
}
// saveTaskState saves a task to persistent storage
func (mq *MaintenanceQueue) saveTaskState(task *MaintenanceTask) {
if mq.persistence != nil {
if err := mq.persistence.SaveTaskState(task); err != nil {
glog.Errorf("Failed to save task state for %s: %v", task.ID, err)
}
}
}
// cleanupCompletedTasks removes old completed tasks beyond the retention limit
func (mq *MaintenanceQueue) cleanupCompletedTasks() {
if mq.persistence != nil {
if err := mq.persistence.CleanupCompletedTasks(); err != nil {
glog.Errorf("Failed to cleanup completed tasks: %v", err)
}
}
}
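
For orientation, here is a minimal sketch of what an implementation behind the TaskPersistence interface could look like: one JSON file per task in a state directory. The fileTaskPersistence type and file layout are hypothetical illustrations, not the implementation in this commit; LoadTaskState, DeleteTaskState, and CleanupCompletedTasks are omitted for brevity.

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// fileTaskPersistence is a hypothetical TaskPersistence backed by one
// JSON file per task; the real commit wires in its own implementation.
type fileTaskPersistence struct {
	dir string
}

// SaveTaskState serializes a task to <dir>/<taskID>.json.
func (p *fileTaskPersistence) SaveTaskState(task *MaintenanceTask) error {
	data, err := json.Marshal(task)
	if err != nil {
		return fmt.Errorf("marshal task %s: %w", task.ID, err)
	}
	return os.WriteFile(filepath.Join(p.dir, task.ID+".json"), data, 0o644)
}

// LoadAllTaskStates reads every *.json file back into a task slice,
// which is what LoadTasksFromPersistence above consumes on startup.
func (p *fileTaskPersistence) LoadAllTaskStates() ([]*MaintenanceTask, error) {
	entries, err := os.ReadDir(p.dir)
	if err != nil {
		return nil, err
	}
	var tasks []*MaintenanceTask
	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".json") {
			continue
		}
		data, err := os.ReadFile(filepath.Join(p.dir, entry.Name()))
		if err != nil {
			return nil, err
		}
		var task MaintenanceTask
		if err := json.Unmarshal(data, &task); err != nil {
			return nil, err
		}
		tasks = append(tasks, &task)
	}
	return tasks, nil
}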
// AddTask adds a new maintenance task to the queue with deduplication
func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
mq.mutex.Lock()
@@ -44,6 +139,18 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
task.CreatedAt = time.Now()
task.MaxRetries = 3 // Default retry count
// Initialize assignment history and set creation context
task.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
if task.CreatedBy == "" {
task.CreatedBy = "maintenance-system"
}
if task.CreationContext == "" {
task.CreationContext = "Automatic task creation based on system monitoring"
}
if task.Tags == nil {
task.Tags = make(map[string]string)
}
mq.tasks[task.ID] = task
mq.pendingTasks = append(mq.pendingTasks, task)
@@ -55,6 +162,9 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
})
// Save task state to persistence
mq.saveTaskState(task)
scheduleInfo := ""
if !task.ScheduledAt.IsZero() && time.Until(task.ScheduledAt) > time.Minute {
scheduleInfo = fmt.Sprintf(", scheduled for %v", task.ScheduledAt.Format("15:04:05"))
@@ -143,7 +253,11 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
// Check if this task type needs a cooldown period
if !mq.canScheduleTaskNow(task) {
glog.V(3).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met", task.ID, task.Type, workerID)
// Add detailed diagnostic information
runningCount := mq.GetRunningTaskCount(task.Type)
maxConcurrent := mq.getMaxConcurrentForTaskType(task.Type)
glog.V(2).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met (running: %d, max: %d)",
task.ID, task.Type, workerID, runningCount, maxConcurrent)
continue
}
@@ -172,6 +286,26 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
return nil
}
// Record assignment history
workerAddress := ""
if worker, exists := mq.workers[workerID]; exists {
workerAddress = worker.Address
}
// Create assignment record
assignmentRecord := &TaskAssignmentRecord{
WorkerID: workerID,
WorkerAddress: workerAddress,
AssignedAt: now,
Reason: "Task assigned to available worker",
}
// Initialize assignment history if nil
if selectedTask.AssignmentHistory == nil {
selectedTask.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
}
selectedTask.AssignmentHistory = append(selectedTask.AssignmentHistory, assignmentRecord)
// Assign the task
selectedTask.Status = TaskStatusAssigned
selectedTask.WorkerID = workerID
@@ -188,6 +322,9 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
// Track pending operation
mq.trackPendingOperation(selectedTask)
// Save task state after assignment
mq.saveTaskState(selectedTask)
glog.Infof("Task assigned: %s (%s) → worker %s (volume %d, server %s)",
selectedTask.ID, selectedTask.Type, workerID, selectedTask.VolumeID, selectedTask.Server)
@@ -220,6 +357,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
// Check if task should be retried
if task.RetryCount < task.MaxRetries {
// Record unassignment due to failure/retry
if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
if lastAssignment.UnassignedAt == nil {
unassignedTime := completedTime
lastAssignment.UnassignedAt = &unassignedTime
lastAssignment.Reason = fmt.Sprintf("Task failed, scheduling retry (attempt %d/%d): %s",
task.RetryCount+1, task.MaxRetries, error)
}
}
task.RetryCount++
task.Status = TaskStatusPending
task.WorkerID = ""
@@ -229,15 +377,31 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
mq.pendingTasks = append(mq.pendingTasks, task)
// Save task state after retry setup
mq.saveTaskState(task)
glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
taskID, task.Type, task.RetryCount, task.MaxRetries, task.WorkerID, duration, error)
} else {
// Record unassignment due to permanent failure
if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
if lastAssignment.UnassignedAt == nil {
unassignedTime := completedTime
lastAssignment.UnassignedAt = &unassignedTime
lastAssignment.Reason = fmt.Sprintf("Task failed permanently after %d retries: %s", task.MaxRetries, error)
}
}
// Save task state after permanent failure
mq.saveTaskState(task)
glog.Errorf("Task failed permanently: %s (%s) worker %s, duration %v, after %d retries: %s",
taskID, task.Type, task.WorkerID, duration, task.MaxRetries, error)
}
} else {
task.Status = TaskStatusCompleted
task.Progress = 100
// Save task state after successful completion
mq.saveTaskState(task)
glog.Infof("Task completed: %s (%s) worker %s, duration %v, volume %d",
taskID, task.Type, task.WorkerID, duration, task.VolumeID)
}
@@ -257,6 +421,14 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
if task.Status != TaskStatusPending {
mq.removePendingOperation(taskID)
}
// Periodically cleanup old completed tasks (every 10th completion)
if task.Status == TaskStatusCompleted {
// Simple counter-based trigger for cleanup
if len(mq.tasks)%10 == 0 {
go mq.cleanupCompletedTasks()
}
}
}
// UpdateTaskProgress updates the progress of a running task
@@ -283,6 +455,11 @@ func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64)
glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete",
taskID, task.Type, task.WorkerID, progress)
}
// Save task state after progress update
if progress == 0 || progress >= 100 || progress-oldProgress >= 10 {
mq.saveTaskState(task)
}
} else {
glog.V(2).Infof("Progress update for unknown task: %s (%.1f%%)", taskID, progress)
}
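
The save condition above throttles persistence writes: state is written at the start (0%), at completion (>= 100%), and otherwise only on every 10-point jump, so frequent small progress updates do not become one disk write per heartbeat. Extracted as a standalone predicate, it would read like this (a sketch; the function name is hypothetical):

// shouldPersistProgress mirrors the save-throttling condition above:
// persist at start, at completion, or on every 10-point jump.
func shouldPersistProgress(oldProgress, newProgress float64) bool {
	return newProgress == 0 || newProgress >= 100 || newProgress-oldProgress >= 10
}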
@@ -489,9 +666,19 @@ func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
for id, worker := range mq.workers {
if worker.LastHeartbeat.Before(cutoff) {
-// Mark any assigned tasks as failed
+// Mark any assigned tasks as failed and record unassignment
for _, task := range mq.tasks {
if task.WorkerID == id && (task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
// Record unassignment due to worker becoming unavailable
if len(task.AssignmentHistory) > 0 {
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
if lastAssignment.UnassignedAt == nil {
unassignedTime := time.Now()
lastAssignment.UnassignedAt = &unassignedTime
lastAssignment.Reason = "Worker became unavailable (stale heartbeat)"
}
}
task.Status = TaskStatusFailed
task.Error = "Worker became unavailable"
completedTime := time.Now()
@@ -600,7 +787,10 @@ func (mq *MaintenanceQueue) canExecuteTaskType(taskType MaintenanceTaskType) boo
runningCount := mq.GetRunningTaskCount(taskType)
maxConcurrent := mq.getMaxConcurrentForTaskType(taskType)
-return runningCount < maxConcurrent
+canExecute := runningCount < maxConcurrent
+glog.V(3).Infof("canExecuteTaskType for %s: running=%d, max=%d, canExecute=%v", taskType, runningCount, maxConcurrent, canExecute)
+return canExecute
}
// getMaxConcurrentForTaskType returns the maximum concurrent tasks allowed for a task type
@@ -684,40 +874,28 @@ func (mq *MaintenanceQueue) trackPendingOperation(task *MaintenanceTask) {
opType = OpTypeVolumeMove
}
-// Determine destination node and estimated size from typed parameters
+// Determine destination node and estimated size from unified targets
destNode := ""
estimatedSize := uint64(1024 * 1024 * 1024) // Default 1GB estimate
-switch params := task.TypedParams.TaskParams.(type) {
-case *worker_pb.TaskParams_ErasureCodingParams:
-if params.ErasureCodingParams != nil {
-if len(params.ErasureCodingParams.Destinations) > 0 {
-destNode = params.ErasureCodingParams.Destinations[0].Node
-}
-if params.ErasureCodingParams.EstimatedShardSize > 0 {
-estimatedSize = params.ErasureCodingParams.EstimatedShardSize
-}
-}
-case *worker_pb.TaskParams_BalanceParams:
-if params.BalanceParams != nil {
-destNode = params.BalanceParams.DestNode
-if params.BalanceParams.EstimatedSize > 0 {
-estimatedSize = params.BalanceParams.EstimatedSize
-}
-}
-case *worker_pb.TaskParams_ReplicationParams:
-if params.ReplicationParams != nil {
-destNode = params.ReplicationParams.DestNode
-if params.ReplicationParams.EstimatedSize > 0 {
-estimatedSize = params.ReplicationParams.EstimatedSize
-}
+// Use unified targets array - the only source of truth
+if len(task.TypedParams.Targets) > 0 {
+destNode = task.TypedParams.Targets[0].Node
+if task.TypedParams.Targets[0].EstimatedSize > 0 {
+estimatedSize = task.TypedParams.Targets[0].EstimatedSize
}
}
+// Determine source node from unified sources
+sourceNode := ""
+if len(task.TypedParams.Sources) > 0 {
+sourceNode = task.TypedParams.Sources[0].Node
+}
operation := &PendingOperation{
VolumeID: task.VolumeID,
OperationType: opType,
-SourceNode: task.Server,
+SourceNode: sourceNode,
DestNode: destNode,
TaskID: task.ID,
StartTime: time.Now(),
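
With the per-task-type switch gone, every task type exposes its placement uniformly through TypedParams.Sources and TypedParams.Targets. A hedged sketch of a small helper in that spirit (the function name is hypothetical; the Targets field shapes are as used above):

// firstTargetOrDefault returns the primary destination node and estimated
// size for a task, falling back to a caller-supplied default size.
func firstTargetOrDefault(params *worker_pb.TaskParams, defaultSize uint64) (string, uint64) {
	size := defaultSize
	if params == nil || len(params.Targets) == 0 {
		return "", size
	}
	target := params.Targets[0]
	if target.EstimatedSize > 0 {
		size = target.EstimatedSize
	}
	return target.Node, size
}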

weed/admin/maintenance/maintenance_scanner.go

@@ -117,6 +117,8 @@ func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics,
Server: node.Id,
DiskType: diskType, // Track which disk this volume is on
DiskId: volInfo.DiskId, // Use disk ID from volume info
DataCenter: dc.Id, // Data center from current loop
Rack: rack.Id, // Rack from current loop
Collection: volInfo.Collection,
Size: volInfo.Size,
DeletedBytes: volInfo.DeletedByteCount,
@@ -207,6 +209,8 @@ func (ms *MaintenanceScanner) convertToTaskMetrics(metrics []*VolumeHealthMetric
Server: metric.Server,
DiskType: metric.DiskType,
DiskId: metric.DiskId,
DataCenter: metric.DataCenter,
Rack: metric.Rack,
Collection: metric.Collection,
Size: metric.Size,
DeletedBytes: metric.DeletedBytes,
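
The dc.Id and rack.Id values above come from the enclosing topology loops, so each volume metric now carries its full location. A minimal sketch of that traversal shape, using stand-in types rather than the real topology protobuf:

// Stand-in types for illustration only; the scanner iterates the
// master's topology response (data centers -> racks -> data nodes).
type dcInfo struct {
	Id    string
	Racks []rackInfo
}
type rackInfo struct {
	Id    string
	Nodes []nodeInfo
}
type nodeInfo struct {
	Id        string
	VolumeIds []uint32
}

type volumeLocation struct {
	Server, DataCenter, Rack string
}

// collectVolumeLocations captures server, data center, and rack for each
// volume by carrying the loop variables down into the per-volume record.
func collectVolumeLocations(dcs []dcInfo) map[uint32]volumeLocation {
	locations := make(map[uint32]volumeLocation)
	for _, dc := range dcs {
		for _, rack := range dc.Racks {
			for _, node := range rack.Nodes {
				for _, vid := range node.VolumeIds {
					locations[vid] = volumeLocation{Server: node.Id, DataCenter: dc.Id, Rack: rack.Id}
				}
			}
		}
	}
	return locations
}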

weed/admin/maintenance/maintenance_types.go

@@ -108,6 +108,57 @@ type MaintenanceTask struct {
Progress float64 `json:"progress"` // 0-100
RetryCount int `json:"retry_count"`
MaxRetries int `json:"max_retries"`
// Enhanced fields for detailed task tracking
CreatedBy string `json:"created_by,omitempty"` // Who/what created this task
CreationContext string `json:"creation_context,omitempty"` // Additional context about creation
AssignmentHistory []*TaskAssignmentRecord `json:"assignment_history,omitempty"` // History of worker assignments
DetailedReason string `json:"detailed_reason,omitempty"` // More detailed explanation than Reason
Tags map[string]string `json:"tags,omitempty"` // Additional metadata tags
}
// TaskAssignmentRecord tracks when a task was assigned to a worker
type TaskAssignmentRecord struct {
WorkerID string `json:"worker_id"`
WorkerAddress string `json:"worker_address"`
AssignedAt time.Time `json:"assigned_at"`
UnassignedAt *time.Time `json:"unassigned_at,omitempty"`
Reason string `json:"reason"` // Why was it assigned/unassigned
}
// TaskExecutionLog represents a log entry from task execution
type TaskExecutionLog struct {
Timestamp time.Time `json:"timestamp"`
Level string `json:"level"` // "info", "warn", "error", "debug"
Message string `json:"message"`
Source string `json:"source"` // Which component logged this
TaskID string `json:"task_id"`
WorkerID string `json:"worker_id"`
// Optional structured fields carried from worker logs
Fields map[string]string `json:"fields,omitempty"`
// Optional progress/status carried from worker logs
Progress *float64 `json:"progress,omitempty"`
Status string `json:"status,omitempty"`
}
// TaskDetailData represents comprehensive information about a task for the detail view
type TaskDetailData struct {
Task *MaintenanceTask `json:"task"`
AssignmentHistory []*TaskAssignmentRecord `json:"assignment_history"`
ExecutionLogs []*TaskExecutionLog `json:"execution_logs"`
RelatedTasks []*MaintenanceTask `json:"related_tasks,omitempty"` // Other tasks on same volume/server
WorkerInfo *MaintenanceWorker `json:"worker_info,omitempty"` // Current or last assigned worker
CreationMetrics *TaskCreationMetrics `json:"creation_metrics,omitempty"` // Metrics that led to task creation
LastUpdated time.Time `json:"last_updated"`
}
// TaskCreationMetrics holds metrics that led to the task being created
type TaskCreationMetrics struct {
TriggerMetric string `json:"trigger_metric"` // What metric triggered this task
MetricValue float64 `json:"metric_value"` // Value of the trigger metric
Threshold float64 `json:"threshold"` // Threshold that was exceeded
VolumeMetrics *VolumeHealthMetrics `json:"volume_metrics,omitempty"`
AdditionalData map[string]interface{} `json:"additional_data,omitempty"`
}
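
Two of the commits above ("logs have fields now", "sort field keys") concern rendering the Fields map on TaskExecutionLog. Since Go randomizes map iteration order, a deterministic rendering has to sort the keys first; a short sketch assuming only the types defined here (the function name is hypothetical):

import (
	"fmt"
	"sort"
	"strings"
)

// formatLogFields renders an entry's structured fields as " k=v" pairs
// in stable key order, so repeated renders of the same entry match.
func formatLogFields(entry *TaskExecutionLog) string {
	keys := make([]string, 0, len(entry.Fields))
	for key := range entry.Fields {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	var b strings.Builder
	for _, key := range keys {
		fmt.Fprintf(&b, " %s=%s", key, entry.Fields[key])
	}
	return b.String()
}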
// MaintenanceConfig holds configuration for the maintenance system
@@ -122,6 +173,15 @@ type MaintenancePolicy = worker_pb.MaintenancePolicy
// DEPRECATED: Use worker_pb.TaskPolicy instead
type TaskPolicy = worker_pb.TaskPolicy
// TaskPersistence interface for task state persistence
type TaskPersistence interface {
SaveTaskState(task *MaintenanceTask) error
LoadTaskState(taskID string) (*MaintenanceTask, error)
LoadAllTaskStates() ([]*MaintenanceTask, error)
DeleteTaskState(taskID string) error
CleanupCompletedTasks() error
}
// Default configuration values
func DefaultMaintenanceConfig() *MaintenanceConfig {
return DefaultMaintenanceConfigProto()
@@ -273,6 +333,7 @@ type MaintenanceQueue struct {
mutex sync.RWMutex
policy *MaintenancePolicy
integration *MaintenanceIntegration
persistence TaskPersistence // Interface for task persistence
}
// MaintenanceScanner analyzes the cluster and generates maintenance tasks
@@ -301,8 +362,10 @@ type TaskDetectionResult struct {
type VolumeHealthMetrics struct {
VolumeID uint32 `json:"volume_id"`
Server string `json:"server"`
-DiskType string `json:"disk_type"` // Disk type (e.g., "hdd", "ssd") or disk path (e.g., "/data1")
-DiskId uint32 `json:"disk_id"` // ID of the disk in Store.Locations array
+DiskType string `json:"disk_type"` // Disk type (e.g., "hdd", "ssd") or disk path (e.g., "/data1")
+DiskId uint32 `json:"disk_id"` // ID of the disk in Store.Locations array
+DataCenter string `json:"data_center"` // Data center of the server
+Rack string `json:"rack"` // Rack of the server
Collection string `json:"collection"`
Size uint64 `json:"size"`
DeletedBytes uint64 `json:"deleted_bytes"`
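
Finally, TaskCreationMetrics (defined above) can be populated from these enriched health metrics when a detector trips a threshold, giving the admin UI's task detail view the triggering measurement and location. A hedged sketch; the trigger name and helper are illustrative, not code from this commit:

// newCreationMetrics records why a task was created, so the task detail
// view can display the metric, the threshold, and where the volume lives.
func newCreationMetrics(m *VolumeHealthMetrics, garbageRatio, threshold float64) *TaskCreationMetrics {
	return &TaskCreationMetrics{
		TriggerMetric: "garbage_ratio",
		MetricValue:   garbageRatio,
		Threshold:     threshold,
		VolumeMetrics: m,
		AdditionalData: map[string]interface{}{
			"server":      m.Server,
			"data_center": m.DataCenter,
			"rack":        m.Rack,
		},
	}
}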