Admin UI: Fetch task logs (#7114)
* show task details * loading tasks * task UI works * generic rendering * rendering the export link * removing placementConflicts from task parameters * remove TaskSourceLocation * remove "Server ID" column * rendering balance task source * sources and targets * fix ec task generation * move info * render timeline * simplified worker id * simplify * read task logs from worker * isValidTaskID * address comments * Update weed/worker/tasks/balance/execution.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update weed/worker/tasks/erasure_coding/ec_task.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update weed/worker/tasks/task_log_handler.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix shard ids * plan distributing shard id * rendering planned shards in task details * remove Conflicts * worker logs correctly * pass in dc and rack * task logging * Update weed/admin/maintenance/maintenance_queue.go Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * display log details * logs have fields now * sort field keys * fix link * fix collection filtering * avoid hard coded ec shard counts --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -7,7 +7,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
|
||||
)
|
||||
|
||||
// NewMaintenanceQueue creates a new maintenance queue
|
||||
@@ -27,6 +26,102 @@ func (mq *MaintenanceQueue) SetIntegration(integration *MaintenanceIntegration)
|
||||
glog.V(1).Infof("Maintenance queue configured with integration")
|
||||
}
|
||||
|
||||
// SetPersistence sets the task persistence interface used to save and
// restore task state across admin-server restarts.
// Callers are expected to invoke this during queue setup, before
// LoadTasksFromPersistence; no locking is performed here.
func (mq *MaintenanceQueue) SetPersistence(persistence TaskPersistence) {
	mq.persistence = persistence
	glog.V(1).Infof("Maintenance queue configured with task persistence")
}
|
||||
|
||||
// LoadTasksFromPersistence loads tasks from persistent storage on startup
|
||||
func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
|
||||
if mq.persistence == nil {
|
||||
glog.V(1).Infof("No task persistence configured, skipping task loading")
|
||||
return nil
|
||||
}
|
||||
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
glog.Infof("Loading tasks from persistence...")
|
||||
|
||||
tasks, err := mq.persistence.LoadAllTaskStates()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load task states: %w", err)
|
||||
}
|
||||
|
||||
glog.Infof("DEBUG LoadTasksFromPersistence: Found %d tasks in persistence", len(tasks))
|
||||
|
||||
// Reset task maps
|
||||
mq.tasks = make(map[string]*MaintenanceTask)
|
||||
mq.pendingTasks = make([]*MaintenanceTask, 0)
|
||||
|
||||
// Load tasks by status
|
||||
for _, task := range tasks {
|
||||
glog.Infof("DEBUG LoadTasksFromPersistence: Loading task %s (type: %s, status: %s, scheduled: %v)", task.ID, task.Type, task.Status, task.ScheduledAt)
|
||||
mq.tasks[task.ID] = task
|
||||
|
||||
switch task.Status {
|
||||
case TaskStatusPending:
|
||||
glog.Infof("DEBUG LoadTasksFromPersistence: Adding task %s to pending queue", task.ID)
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
case TaskStatusAssigned, TaskStatusInProgress:
|
||||
// For assigned/in-progress tasks, we need to check if the worker is still available
|
||||
// If not, we should fail them and make them eligible for retry
|
||||
if task.WorkerID != "" {
|
||||
if _, exists := mq.workers[task.WorkerID]; !exists {
|
||||
glog.Warningf("Task %s was assigned to unavailable worker %s, marking as failed", task.ID, task.WorkerID)
|
||||
task.Status = TaskStatusFailed
|
||||
task.Error = "Worker unavailable after restart"
|
||||
completedTime := time.Now()
|
||||
task.CompletedAt = &completedTime
|
||||
|
||||
// Check if it should be retried
|
||||
if task.RetryCount < task.MaxRetries {
|
||||
task.RetryCount++
|
||||
task.Status = TaskStatusPending
|
||||
task.WorkerID = ""
|
||||
task.StartedAt = nil
|
||||
task.CompletedAt = nil
|
||||
task.Error = ""
|
||||
task.ScheduledAt = time.Now().Add(1 * time.Minute) // Retry after restart delay
|
||||
glog.Infof("DEBUG LoadTasksFromPersistence: Retrying task %s, adding to pending queue", task.ID)
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort pending tasks by priority and schedule time
|
||||
sort.Slice(mq.pendingTasks, func(i, j int) bool {
|
||||
if mq.pendingTasks[i].Priority != mq.pendingTasks[j].Priority {
|
||||
return mq.pendingTasks[i].Priority > mq.pendingTasks[j].Priority
|
||||
}
|
||||
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
|
||||
})
|
||||
|
||||
glog.Infof("Loaded %d tasks from persistence (%d pending)", len(tasks), len(mq.pendingTasks))
|
||||
return nil
|
||||
}
|
||||
|
||||
// saveTaskState saves a task to persistent storage
|
||||
func (mq *MaintenanceQueue) saveTaskState(task *MaintenanceTask) {
|
||||
if mq.persistence != nil {
|
||||
if err := mq.persistence.SaveTaskState(task); err != nil {
|
||||
glog.Errorf("Failed to save task state for %s: %v", task.ID, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupCompletedTasks removes old completed tasks beyond the retention limit
|
||||
func (mq *MaintenanceQueue) cleanupCompletedTasks() {
|
||||
if mq.persistence != nil {
|
||||
if err := mq.persistence.CleanupCompletedTasks(); err != nil {
|
||||
glog.Errorf("Failed to cleanup completed tasks: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// AddTask adds a new maintenance task to the queue with deduplication
|
||||
func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
|
||||
mq.mutex.Lock()
|
||||
@@ -44,6 +139,18 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
|
||||
task.CreatedAt = time.Now()
|
||||
task.MaxRetries = 3 // Default retry count
|
||||
|
||||
// Initialize assignment history and set creation context
|
||||
task.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
|
||||
if task.CreatedBy == "" {
|
||||
task.CreatedBy = "maintenance-system"
|
||||
}
|
||||
if task.CreationContext == "" {
|
||||
task.CreationContext = "Automatic task creation based on system monitoring"
|
||||
}
|
||||
if task.Tags == nil {
|
||||
task.Tags = make(map[string]string)
|
||||
}
|
||||
|
||||
mq.tasks[task.ID] = task
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
|
||||
@@ -55,6 +162,9 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
|
||||
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
|
||||
})
|
||||
|
||||
// Save task state to persistence
|
||||
mq.saveTaskState(task)
|
||||
|
||||
scheduleInfo := ""
|
||||
if !task.ScheduledAt.IsZero() && time.Until(task.ScheduledAt) > time.Minute {
|
||||
scheduleInfo = fmt.Sprintf(", scheduled for %v", task.ScheduledAt.Format("15:04:05"))
|
||||
@@ -143,7 +253,11 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
|
||||
|
||||
// Check if this task type needs a cooldown period
|
||||
if !mq.canScheduleTaskNow(task) {
|
||||
glog.V(3).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met", task.ID, task.Type, workerID)
|
||||
// Add detailed diagnostic information
|
||||
runningCount := mq.GetRunningTaskCount(task.Type)
|
||||
maxConcurrent := mq.getMaxConcurrentForTaskType(task.Type)
|
||||
glog.V(2).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met (running: %d, max: %d)",
|
||||
task.ID, task.Type, workerID, runningCount, maxConcurrent)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -172,6 +286,26 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
|
||||
return nil
|
||||
}
|
||||
|
||||
// Record assignment history
|
||||
workerAddress := ""
|
||||
if worker, exists := mq.workers[workerID]; exists {
|
||||
workerAddress = worker.Address
|
||||
}
|
||||
|
||||
// Create assignment record
|
||||
assignmentRecord := &TaskAssignmentRecord{
|
||||
WorkerID: workerID,
|
||||
WorkerAddress: workerAddress,
|
||||
AssignedAt: now,
|
||||
Reason: "Task assigned to available worker",
|
||||
}
|
||||
|
||||
// Initialize assignment history if nil
|
||||
if selectedTask.AssignmentHistory == nil {
|
||||
selectedTask.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
|
||||
}
|
||||
selectedTask.AssignmentHistory = append(selectedTask.AssignmentHistory, assignmentRecord)
|
||||
|
||||
// Assign the task
|
||||
selectedTask.Status = TaskStatusAssigned
|
||||
selectedTask.WorkerID = workerID
|
||||
@@ -188,6 +322,9 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
|
||||
// Track pending operation
|
||||
mq.trackPendingOperation(selectedTask)
|
||||
|
||||
// Save task state after assignment
|
||||
mq.saveTaskState(selectedTask)
|
||||
|
||||
glog.Infof("Task assigned: %s (%s) → worker %s (volume %d, server %s)",
|
||||
selectedTask.ID, selectedTask.Type, workerID, selectedTask.VolumeID, selectedTask.Server)
|
||||
|
||||
@@ -220,6 +357,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
|
||||
|
||||
// Check if task should be retried
|
||||
if task.RetryCount < task.MaxRetries {
|
||||
// Record unassignment due to failure/retry
|
||||
if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
|
||||
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
|
||||
if lastAssignment.UnassignedAt == nil {
|
||||
unassignedTime := completedTime
|
||||
lastAssignment.UnassignedAt = &unassignedTime
|
||||
lastAssignment.Reason = fmt.Sprintf("Task failed, scheduling retry (attempt %d/%d): %s",
|
||||
task.RetryCount+1, task.MaxRetries, error)
|
||||
}
|
||||
}
|
||||
|
||||
task.RetryCount++
|
||||
task.Status = TaskStatusPending
|
||||
task.WorkerID = ""
|
||||
@@ -229,15 +377,31 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
|
||||
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
|
||||
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
// Save task state after retry setup
|
||||
mq.saveTaskState(task)
|
||||
glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
|
||||
taskID, task.Type, task.RetryCount, task.MaxRetries, task.WorkerID, duration, error)
|
||||
} else {
|
||||
// Record unassignment due to permanent failure
|
||||
if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
|
||||
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
|
||||
if lastAssignment.UnassignedAt == nil {
|
||||
unassignedTime := completedTime
|
||||
lastAssignment.UnassignedAt = &unassignedTime
|
||||
lastAssignment.Reason = fmt.Sprintf("Task failed permanently after %d retries: %s", task.MaxRetries, error)
|
||||
}
|
||||
}
|
||||
|
||||
// Save task state after permanent failure
|
||||
mq.saveTaskState(task)
|
||||
glog.Errorf("Task failed permanently: %s (%s) worker %s, duration %v, after %d retries: %s",
|
||||
taskID, task.Type, task.WorkerID, duration, task.MaxRetries, error)
|
||||
}
|
||||
} else {
|
||||
task.Status = TaskStatusCompleted
|
||||
task.Progress = 100
|
||||
// Save task state after successful completion
|
||||
mq.saveTaskState(task)
|
||||
glog.Infof("Task completed: %s (%s) worker %s, duration %v, volume %d",
|
||||
taskID, task.Type, task.WorkerID, duration, task.VolumeID)
|
||||
}
|
||||
@@ -257,6 +421,14 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
|
||||
if task.Status != TaskStatusPending {
|
||||
mq.removePendingOperation(taskID)
|
||||
}
|
||||
|
||||
// Periodically cleanup old completed tasks (every 10th completion)
|
||||
if task.Status == TaskStatusCompleted {
|
||||
// Simple counter-based trigger for cleanup
|
||||
if len(mq.tasks)%10 == 0 {
|
||||
go mq.cleanupCompletedTasks()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateTaskProgress updates the progress of a running task
|
||||
@@ -283,6 +455,11 @@ func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64)
|
||||
glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete",
|
||||
taskID, task.Type, task.WorkerID, progress)
|
||||
}
|
||||
|
||||
// Save task state after progress update
|
||||
if progress == 0 || progress >= 100 || progress-oldProgress >= 10 {
|
||||
mq.saveTaskState(task)
|
||||
}
|
||||
} else {
|
||||
glog.V(2).Infof("Progress update for unknown task: %s (%.1f%%)", taskID, progress)
|
||||
}
|
||||
@@ -489,9 +666,19 @@ func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
|
||||
|
||||
for id, worker := range mq.workers {
|
||||
if worker.LastHeartbeat.Before(cutoff) {
|
||||
// Mark any assigned tasks as failed
|
||||
// Mark any assigned tasks as failed and record unassignment
|
||||
for _, task := range mq.tasks {
|
||||
if task.WorkerID == id && (task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
|
||||
// Record unassignment due to worker becoming unavailable
|
||||
if len(task.AssignmentHistory) > 0 {
|
||||
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
|
||||
if lastAssignment.UnassignedAt == nil {
|
||||
unassignedTime := time.Now()
|
||||
lastAssignment.UnassignedAt = &unassignedTime
|
||||
lastAssignment.Reason = "Worker became unavailable (stale heartbeat)"
|
||||
}
|
||||
}
|
||||
|
||||
task.Status = TaskStatusFailed
|
||||
task.Error = "Worker became unavailable"
|
||||
completedTime := time.Now()
|
||||
@@ -600,7 +787,10 @@ func (mq *MaintenanceQueue) canExecuteTaskType(taskType MaintenanceTaskType) boo
|
||||
runningCount := mq.GetRunningTaskCount(taskType)
|
||||
maxConcurrent := mq.getMaxConcurrentForTaskType(taskType)
|
||||
|
||||
return runningCount < maxConcurrent
|
||||
canExecute := runningCount < maxConcurrent
|
||||
glog.V(3).Infof("canExecuteTaskType for %s: running=%d, max=%d, canExecute=%v", taskType, runningCount, maxConcurrent, canExecute)
|
||||
|
||||
return canExecute
|
||||
}
|
||||
|
||||
// getMaxConcurrentForTaskType returns the maximum concurrent tasks allowed for a task type
|
||||
@@ -684,40 +874,28 @@ func (mq *MaintenanceQueue) trackPendingOperation(task *MaintenanceTask) {
|
||||
opType = OpTypeVolumeMove
|
||||
}
|
||||
|
||||
// Determine destination node and estimated size from typed parameters
|
||||
// Determine destination node and estimated size from unified targets
|
||||
destNode := ""
|
||||
estimatedSize := uint64(1024 * 1024 * 1024) // Default 1GB estimate
|
||||
|
||||
switch params := task.TypedParams.TaskParams.(type) {
|
||||
case *worker_pb.TaskParams_ErasureCodingParams:
|
||||
if params.ErasureCodingParams != nil {
|
||||
if len(params.ErasureCodingParams.Destinations) > 0 {
|
||||
destNode = params.ErasureCodingParams.Destinations[0].Node
|
||||
}
|
||||
if params.ErasureCodingParams.EstimatedShardSize > 0 {
|
||||
estimatedSize = params.ErasureCodingParams.EstimatedShardSize
|
||||
}
|
||||
}
|
||||
case *worker_pb.TaskParams_BalanceParams:
|
||||
if params.BalanceParams != nil {
|
||||
destNode = params.BalanceParams.DestNode
|
||||
if params.BalanceParams.EstimatedSize > 0 {
|
||||
estimatedSize = params.BalanceParams.EstimatedSize
|
||||
}
|
||||
}
|
||||
case *worker_pb.TaskParams_ReplicationParams:
|
||||
if params.ReplicationParams != nil {
|
||||
destNode = params.ReplicationParams.DestNode
|
||||
if params.ReplicationParams.EstimatedSize > 0 {
|
||||
estimatedSize = params.ReplicationParams.EstimatedSize
|
||||
}
|
||||
// Use unified targets array - the only source of truth
|
||||
if len(task.TypedParams.Targets) > 0 {
|
||||
destNode = task.TypedParams.Targets[0].Node
|
||||
if task.TypedParams.Targets[0].EstimatedSize > 0 {
|
||||
estimatedSize = task.TypedParams.Targets[0].EstimatedSize
|
||||
}
|
||||
}
|
||||
|
||||
// Determine source node from unified sources
|
||||
sourceNode := ""
|
||||
if len(task.TypedParams.Sources) > 0 {
|
||||
sourceNode = task.TypedParams.Sources[0].Node
|
||||
}
|
||||
|
||||
operation := &PendingOperation{
|
||||
VolumeID: task.VolumeID,
|
||||
OperationType: opType,
|
||||
SourceNode: task.Server,
|
||||
SourceNode: sourceNode,
|
||||
DestNode: destNode,
|
||||
TaskID: task.ID,
|
||||
StartTime: time.Now(),
|
||||
|
||||
Reference in New Issue
Block a user