Admin UI: Fetch task logs (#7114)

* show task details

* loading tasks

* task UI works

* generic rendering

* rendering the export link

* removing placementConflicts from task parameters

* remove TaskSourceLocation

* remove "Server ID" column

* rendering balance task source

* sources and targets

* fix ec task generation

* move info

* render timeline

* simplified worker id

* simplify

* read task logs from worker

* isValidTaskID

* address comments

* Update weed/worker/tasks/balance/execution.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update weed/worker/tasks/erasure_coding/ec_task.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update weed/worker/tasks/task_log_handler.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix shard ids

* plan distributing shard id

* rendering planned shards in task details

* remove Conflicts

* worker logs correctly

* pass in dc and rack

* task logging

* Update weed/admin/maintenance/maintenance_queue.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* display log details

* logs have fields now

* sort field keys

* fix link

* fix collection filtering

* avoid hard coded ec shard counts

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Author: Chris Lu
Date: 2025-08-09 21:47:29 -07:00
Committed by: GitHub
Parent commit: 3ac2a2e22d
Commit: 25bbf4c3d4
52 changed files with 7307 additions and 2004 deletions

weed/admin/maintenance/maintenance_queue.go

@@ -7,7 +7,6 @@ import (
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
)
// NewMaintenanceQueue creates a new maintenance queue
@@ -27,6 +26,102 @@ func (mq *MaintenanceQueue) SetIntegration(integration *MaintenanceIntegration)
glog.V(1).Infof("Maintenance queue configured with integration")
}
// SetPersistence sets the task persistence interface
func (mq *MaintenanceQueue) SetPersistence(persistence TaskPersistence) {
mq.persistence = persistence
glog.V(1).Infof("Maintenance queue configured with task persistence")
}
// LoadTasksFromPersistence loads tasks from persistent storage on startup
func (mq *MaintenanceQueue) LoadTasksFromPersistence() error {
if mq.persistence == nil {
glog.V(1).Infof("No task persistence configured, skipping task loading")
return nil
}
mq.mutex.Lock()
defer mq.mutex.Unlock()
glog.Infof("Loading tasks from persistence...")
tasks, err := mq.persistence.LoadAllTaskStates()
if err != nil {
return fmt.Errorf("failed to load task states: %w", err)
}
glog.Infof("DEBUG LoadTasksFromPersistence: Found %d tasks in persistence", len(tasks))
// Reset task maps
mq.tasks = make(map[string]*MaintenanceTask)
mq.pendingTasks = make([]*MaintenanceTask, 0)
// Load tasks by status
for _, task := range tasks {
glog.Infof("DEBUG LoadTasksFromPersistence: Loading task %s (type: %s, status: %s, scheduled: %v)", task.ID, task.Type, task.Status, task.ScheduledAt)
mq.tasks[task.ID] = task
switch task.Status {
case TaskStatusPending:
glog.Infof("DEBUG LoadTasksFromPersistence: Adding task %s to pending queue", task.ID)
mq.pendingTasks = append(mq.pendingTasks, task)
case TaskStatusAssigned, TaskStatusInProgress:
// For assigned/in-progress tasks, we need to check if the worker is still available
// If not, we should fail them and make them eligible for retry
if task.WorkerID != "" {
if _, exists := mq.workers[task.WorkerID]; !exists {
glog.Warningf("Task %s was assigned to unavailable worker %s, marking as failed", task.ID, task.WorkerID)
task.Status = TaskStatusFailed
task.Error = "Worker unavailable after restart"
completedTime := time.Now()
task.CompletedAt = &completedTime
// Check if it should be retried
if task.RetryCount < task.MaxRetries {
task.RetryCount++
task.Status = TaskStatusPending
task.WorkerID = ""
task.StartedAt = nil
task.CompletedAt = nil
task.Error = ""
task.ScheduledAt = time.Now().Add(1 * time.Minute) // Retry after restart delay
glog.Infof("DEBUG LoadTasksFromPersistence: Retrying task %s, adding to pending queue", task.ID)
mq.pendingTasks = append(mq.pendingTasks, task)
}
}
}
}
}
// Sort pending tasks by priority and schedule time
sort.Slice(mq.pendingTasks, func(i, j int) bool {
if mq.pendingTasks[i].Priority != mq.pendingTasks[j].Priority {
return mq.pendingTasks[i].Priority > mq.pendingTasks[j].Priority
}
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
})
glog.Infof("Loaded %d tasks from persistence (%d pending)", len(tasks), len(mq.pendingTasks))
return nil
}
// saveTaskState saves a task to persistent storage
func (mq *MaintenanceQueue) saveTaskState(task *MaintenanceTask) {
if mq.persistence != nil {
if err := mq.persistence.SaveTaskState(task); err != nil {
glog.Errorf("Failed to save task state for %s: %v", task.ID, err)
}
}
}
// cleanupCompletedTasks removes old completed tasks beyond the retention limit
func (mq *MaintenanceQueue) cleanupCompletedTasks() {
if mq.persistence != nil {
if err := mq.persistence.CleanupCompletedTasks(); err != nil {
glog.Errorf("Failed to cleanup completed tasks: %v", err)
}
}
}
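
For orientation, here is a minimal sketch of what an implementation behind the TaskPersistence interface could look like: one JSON file per task in a state directory. The fileTaskPersistence type and file layout are hypothetical illustrations, not the implementation in this commit; LoadTaskState, DeleteTaskState, and CleanupCompletedTasks are omitted for brevity.

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// fileTaskPersistence is a hypothetical TaskPersistence backed by one
// JSON file per task; the real commit wires in its own implementation.
type fileTaskPersistence struct {
	dir string
}

// SaveTaskState serializes a task to <dir>/<taskID>.json.
func (p *fileTaskPersistence) SaveTaskState(task *MaintenanceTask) error {
	data, err := json.Marshal(task)
	if err != nil {
		return fmt.Errorf("marshal task %s: %w", task.ID, err)
	}
	return os.WriteFile(filepath.Join(p.dir, task.ID+".json"), data, 0o644)
}

// LoadAllTaskStates reads every *.json file back into a task slice,
// which is what LoadTasksFromPersistence above consumes on startup.
func (p *fileTaskPersistence) LoadAllTaskStates() ([]*MaintenanceTask, error) {
	entries, err := os.ReadDir(p.dir)
	if err != nil {
		return nil, err
	}
	var tasks []*MaintenanceTask
	for _, entry := range entries {
		if !strings.HasSuffix(entry.Name(), ".json") {
			continue
		}
		data, err := os.ReadFile(filepath.Join(p.dir, entry.Name()))
		if err != nil {
			return nil, err
		}
		var task MaintenanceTask
		if err := json.Unmarshal(data, &task); err != nil {
			return nil, err
		}
		tasks = append(tasks, &task)
	}
	return tasks, nil
}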
// AddTask adds a new maintenance task to the queue with deduplication
func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
mq.mutex.Lock()
@@ -44,6 +139,18 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
task.CreatedAt = time.Now()
task.MaxRetries = 3 // Default retry count
// Initialize assignment history and set creation context
task.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
if task.CreatedBy == "" {
task.CreatedBy = "maintenance-system"
}
if task.CreationContext == "" {
task.CreationContext = "Automatic task creation based on system monitoring"
}
if task.Tags == nil {
task.Tags = make(map[string]string)
}
mq.tasks[task.ID] = task
mq.pendingTasks = append(mq.pendingTasks, task)
@@ -55,6 +162,9 @@ func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
})
// Save task state to persistence
mq.saveTaskState(task)
scheduleInfo := ""
if !task.ScheduledAt.IsZero() && time.Until(task.ScheduledAt) > time.Minute {
scheduleInfo = fmt.Sprintf(", scheduled for %v", task.ScheduledAt.Format("15:04:05"))
@@ -143,7 +253,11 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
// Check if this task type needs a cooldown period
if !mq.canScheduleTaskNow(task) {
glog.V(3).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met", task.ID, task.Type, workerID)
// Add detailed diagnostic information
runningCount := mq.GetRunningTaskCount(task.Type)
maxConcurrent := mq.getMaxConcurrentForTaskType(task.Type)
glog.V(2).Infof("Task %s (%s) skipped for worker %s: scheduling constraints not met (running: %d, max: %d)",
task.ID, task.Type, workerID, runningCount, maxConcurrent)
continue
}
@@ -172,6 +286,26 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
return nil
}
// Record assignment history
workerAddress := ""
if worker, exists := mq.workers[workerID]; exists {
workerAddress = worker.Address
}
// Create assignment record
assignmentRecord := &TaskAssignmentRecord{
WorkerID: workerID,
WorkerAddress: workerAddress,
AssignedAt: now,
Reason: "Task assigned to available worker",
}
// Initialize assignment history if nil
if selectedTask.AssignmentHistory == nil {
selectedTask.AssignmentHistory = make([]*TaskAssignmentRecord, 0)
}
selectedTask.AssignmentHistory = append(selectedTask.AssignmentHistory, assignmentRecord)
// Assign the task
selectedTask.Status = TaskStatusAssigned
selectedTask.WorkerID = workerID
@@ -188,6 +322,9 @@ func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []Maintena
// Track pending operation
mq.trackPendingOperation(selectedTask)
// Save task state after assignment
mq.saveTaskState(selectedTask)
glog.Infof("Task assigned: %s (%s) → worker %s (volume %d, server %s)",
selectedTask.ID, selectedTask.Type, workerID, selectedTask.VolumeID, selectedTask.Server)
@@ -220,6 +357,17 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
// Check if task should be retried
if task.RetryCount < task.MaxRetries {
// Record unassignment due to failure/retry
if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
if lastAssignment.UnassignedAt == nil {
unassignedTime := completedTime
lastAssignment.UnassignedAt = &unassignedTime
lastAssignment.Reason = fmt.Sprintf("Task failed, scheduling retry (attempt %d/%d): %s",
task.RetryCount+1, task.MaxRetries, error)
}
}
task.RetryCount++
task.Status = TaskStatusPending
task.WorkerID = ""
@@ -229,15 +377,31 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
mq.pendingTasks = append(mq.pendingTasks, task)
// Save task state after retry setup
mq.saveTaskState(task)
glog.Warningf("Task failed, scheduling retry: %s (%s) attempt %d/%d, worker %s, duration %v, error: %s",
taskID, task.Type, task.RetryCount, task.MaxRetries, task.WorkerID, duration, error)
} else {
// Record unassignment due to permanent failure
if task.WorkerID != "" && len(task.AssignmentHistory) > 0 {
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
if lastAssignment.UnassignedAt == nil {
unassignedTime := completedTime
lastAssignment.UnassignedAt = &unassignedTime
lastAssignment.Reason = fmt.Sprintf("Task failed permanently after %d retries: %s", task.MaxRetries, error)
}
}
// Save task state after permanent failure
mq.saveTaskState(task)
glog.Errorf("Task failed permanently: %s (%s) worker %s, duration %v, after %d retries: %s",
taskID, task.Type, task.WorkerID, duration, task.MaxRetries, error)
}
} else {
task.Status = TaskStatusCompleted
task.Progress = 100
// Save task state after successful completion
mq.saveTaskState(task)
glog.Infof("Task completed: %s (%s) worker %s, duration %v, volume %d",
taskID, task.Type, task.WorkerID, duration, task.VolumeID)
}
@@ -257,6 +421,14 @@ func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
if task.Status != TaskStatusPending {
mq.removePendingOperation(taskID)
}
// Periodically cleanup old completed tasks (every 10th completion)
if task.Status == TaskStatusCompleted {
// Simple counter-based trigger for cleanup
if len(mq.tasks)%10 == 0 {
go mq.cleanupCompletedTasks()
}
}
}
// UpdateTaskProgress updates the progress of a running task
@@ -283,6 +455,11 @@ func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64)
glog.V(1).Infof("Task progress: %s (%s) worker %s, %.1f%% complete",
taskID, task.Type, task.WorkerID, progress)
}
// Save task state after progress update
if progress == 0 || progress >= 100 || progress-oldProgress >= 10 {
mq.saveTaskState(task)
}
} else {
glog.V(2).Infof("Progress update for unknown task: %s (%.1f%%)", taskID, progress)
}
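
The save condition above throttles persistence writes: state is written at the start (0%), at completion (>= 100%), and otherwise only on every 10-point jump, so frequent small progress updates do not become one disk write per heartbeat. Extracted as a standalone predicate, it would read like this (a sketch; the function name is hypothetical):

// shouldPersistProgress mirrors the save-throttling condition above:
// persist at start, at completion, or on every 10-point jump.
func shouldPersistProgress(oldProgress, newProgress float64) bool {
	return newProgress == 0 || newProgress >= 100 || newProgress-oldProgress >= 10
}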
@@ -489,9 +666,19 @@ func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
for id, worker := range mq.workers {
if worker.LastHeartbeat.Before(cutoff) {
-// Mark any assigned tasks as failed
+// Mark any assigned tasks as failed and record unassignment
for _, task := range mq.tasks {
if task.WorkerID == id && (task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
// Record unassignment due to worker becoming unavailable
if len(task.AssignmentHistory) > 0 {
lastAssignment := task.AssignmentHistory[len(task.AssignmentHistory)-1]
if lastAssignment.UnassignedAt == nil {
unassignedTime := time.Now()
lastAssignment.UnassignedAt = &unassignedTime
lastAssignment.Reason = "Worker became unavailable (stale heartbeat)"
}
}
task.Status = TaskStatusFailed
task.Error = "Worker became unavailable"
completedTime := time.Now()
@@ -600,7 +787,10 @@ func (mq *MaintenanceQueue) canExecuteTaskType(taskType MaintenanceTaskType) boo
runningCount := mq.GetRunningTaskCount(taskType)
maxConcurrent := mq.getMaxConcurrentForTaskType(taskType)
-return runningCount < maxConcurrent
+canExecute := runningCount < maxConcurrent
+glog.V(3).Infof("canExecuteTaskType for %s: running=%d, max=%d, canExecute=%v", taskType, runningCount, maxConcurrent, canExecute)
+return canExecute
}
// getMaxConcurrentForTaskType returns the maximum concurrent tasks allowed for a task type
@@ -684,40 +874,28 @@ func (mq *MaintenanceQueue) trackPendingOperation(task *MaintenanceTask) {
opType = OpTypeVolumeMove
}
-// Determine destination node and estimated size from typed parameters
+// Determine destination node and estimated size from unified targets
destNode := ""
estimatedSize := uint64(1024 * 1024 * 1024) // Default 1GB estimate
-switch params := task.TypedParams.TaskParams.(type) {
-case *worker_pb.TaskParams_ErasureCodingParams:
-if params.ErasureCodingParams != nil {
-if len(params.ErasureCodingParams.Destinations) > 0 {
-destNode = params.ErasureCodingParams.Destinations[0].Node
-}
-if params.ErasureCodingParams.EstimatedShardSize > 0 {
-estimatedSize = params.ErasureCodingParams.EstimatedShardSize
-}
-}
-case *worker_pb.TaskParams_BalanceParams:
-if params.BalanceParams != nil {
-destNode = params.BalanceParams.DestNode
-if params.BalanceParams.EstimatedSize > 0 {
-estimatedSize = params.BalanceParams.EstimatedSize
-}
-}
-case *worker_pb.TaskParams_ReplicationParams:
-if params.ReplicationParams != nil {
-destNode = params.ReplicationParams.DestNode
-if params.ReplicationParams.EstimatedSize > 0 {
-estimatedSize = params.ReplicationParams.EstimatedSize
-}
+// Use unified targets array - the only source of truth
+if len(task.TypedParams.Targets) > 0 {
+destNode = task.TypedParams.Targets[0].Node
+if task.TypedParams.Targets[0].EstimatedSize > 0 {
+estimatedSize = task.TypedParams.Targets[0].EstimatedSize
}
}
+// Determine source node from unified sources
+sourceNode := ""
+if len(task.TypedParams.Sources) > 0 {
+sourceNode = task.TypedParams.Sources[0].Node
+}
operation := &PendingOperation{
VolumeID: task.VolumeID,
OperationType: opType,
-SourceNode: task.Server,
+SourceNode: sourceNode,
DestNode: destNode,
TaskID: task.ID,
StartTime: time.Now(),
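
With the per-task-type switch gone, every task type exposes its placement uniformly through TypedParams.Sources and TypedParams.Targets. A hedged sketch of a small helper in that spirit (the function name is hypothetical; the Targets field shapes are as used above):

// firstTargetOrDefault returns the primary destination node and estimated
// size for a task, falling back to a caller-supplied default size.
func firstTargetOrDefault(params *worker_pb.TaskParams, defaultSize uint64) (string, uint64) {
	size := defaultSize
	if params == nil || len(params.Targets) == 0 {
		return "", size
	}
	target := params.Targets[0]
	if target.EstimatedSize > 0 {
		size = target.EstimatedSize
	}
	return target.Node, size
}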

weed/admin/maintenance/maintenance_scanner.go

@@ -117,6 +117,8 @@ func (ms *MaintenanceScanner) getVolumeHealthMetrics() ([]*VolumeHealthMetrics,
Server: node.Id,
DiskType: diskType, // Track which disk this volume is on
DiskId: volInfo.DiskId, // Use disk ID from volume info
DataCenter: dc.Id, // Data center from current loop
Rack: rack.Id, // Rack from current loop
Collection: volInfo.Collection,
Size: volInfo.Size,
DeletedBytes: volInfo.DeletedByteCount,
@@ -207,6 +209,8 @@ func (ms *MaintenanceScanner) convertToTaskMetrics(metrics []*VolumeHealthMetric
Server: metric.Server,
DiskType: metric.DiskType,
DiskId: metric.DiskId,
DataCenter: metric.DataCenter,
Rack: metric.Rack,
Collection: metric.Collection,
Size: metric.Size,
DeletedBytes: metric.DeletedBytes,
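
The dc.Id and rack.Id values above come from the enclosing topology loops, so each volume metric now carries its full location. A minimal sketch of that traversal shape, using stand-in types rather than the real topology protobuf:

// Stand-in types for illustration only; the scanner iterates the
// master's topology response (data centers -> racks -> data nodes).
type dcInfo struct {
	Id    string
	Racks []rackInfo
}
type rackInfo struct {
	Id    string
	Nodes []nodeInfo
}
type nodeInfo struct {
	Id        string
	VolumeIds []uint32
}

type volumeLocation struct {
	Server, DataCenter, Rack string
}

// collectVolumeLocations captures server, data center, and rack for each
// volume by carrying the loop variables down into the per-volume record.
func collectVolumeLocations(dcs []dcInfo) map[uint32]volumeLocation {
	locations := make(map[uint32]volumeLocation)
	for _, dc := range dcs {
		for _, rack := range dc.Racks {
			for _, node := range rack.Nodes {
				for _, vid := range node.VolumeIds {
					locations[vid] = volumeLocation{Server: node.Id, DataCenter: dc.Id, Rack: rack.Id}
				}
			}
		}
	}
	return locations
}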

weed/admin/maintenance/maintenance_types.go

@@ -108,6 +108,57 @@ type MaintenanceTask struct {
Progress float64 `json:"progress"` // 0-100
RetryCount int `json:"retry_count"`
MaxRetries int `json:"max_retries"`
// Enhanced fields for detailed task tracking
CreatedBy string `json:"created_by,omitempty"` // Who/what created this task
CreationContext string `json:"creation_context,omitempty"` // Additional context about creation
AssignmentHistory []*TaskAssignmentRecord `json:"assignment_history,omitempty"` // History of worker assignments
DetailedReason string `json:"detailed_reason,omitempty"` // More detailed explanation than Reason
Tags map[string]string `json:"tags,omitempty"` // Additional metadata tags
}
// TaskAssignmentRecord tracks when a task was assigned to a worker
type TaskAssignmentRecord struct {
WorkerID string `json:"worker_id"`
WorkerAddress string `json:"worker_address"`
AssignedAt time.Time `json:"assigned_at"`
UnassignedAt *time.Time `json:"unassigned_at,omitempty"`
Reason string `json:"reason"` // Why was it assigned/unassigned
}
// TaskExecutionLog represents a log entry from task execution
type TaskExecutionLog struct {
Timestamp time.Time `json:"timestamp"`
Level string `json:"level"` // "info", "warn", "error", "debug"
Message string `json:"message"`
Source string `json:"source"` // Which component logged this
TaskID string `json:"task_id"`
WorkerID string `json:"worker_id"`
// Optional structured fields carried from worker logs
Fields map[string]string `json:"fields,omitempty"`
// Optional progress/status carried from worker logs
Progress *float64 `json:"progress,omitempty"`
Status string `json:"status,omitempty"`
}
// TaskDetailData represents comprehensive information about a task for the detail view
type TaskDetailData struct {
Task *MaintenanceTask `json:"task"`
AssignmentHistory []*TaskAssignmentRecord `json:"assignment_history"`
ExecutionLogs []*TaskExecutionLog `json:"execution_logs"`
RelatedTasks []*MaintenanceTask `json:"related_tasks,omitempty"` // Other tasks on same volume/server
WorkerInfo *MaintenanceWorker `json:"worker_info,omitempty"` // Current or last assigned worker
CreationMetrics *TaskCreationMetrics `json:"creation_metrics,omitempty"` // Metrics that led to task creation
LastUpdated time.Time `json:"last_updated"`
}
// TaskCreationMetrics holds metrics that led to the task being created
type TaskCreationMetrics struct {
TriggerMetric string `json:"trigger_metric"` // What metric triggered this task
MetricValue float64 `json:"metric_value"` // Value of the trigger metric
Threshold float64 `json:"threshold"` // Threshold that was exceeded
VolumeMetrics *VolumeHealthMetrics `json:"volume_metrics,omitempty"`
AdditionalData map[string]interface{} `json:"additional_data,omitempty"`
}
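
Two of the commits above ("logs have fields now", "sort field keys") concern rendering the Fields map on TaskExecutionLog. Since Go randomizes map iteration order, a deterministic rendering has to sort the keys first; a short sketch assuming only the types defined here (the function name is hypothetical):

import (
	"fmt"
	"sort"
	"strings"
)

// formatLogFields renders an entry's structured fields as " k=v" pairs
// in stable key order, so repeated renders of the same entry match.
func formatLogFields(entry *TaskExecutionLog) string {
	keys := make([]string, 0, len(entry.Fields))
	for key := range entry.Fields {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	var b strings.Builder
	for _, key := range keys {
		fmt.Fprintf(&b, " %s=%s", key, entry.Fields[key])
	}
	return b.String()
}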
// MaintenanceConfig holds configuration for the maintenance system
@@ -122,6 +173,15 @@ type MaintenancePolicy = worker_pb.MaintenancePolicy
// DEPRECATED: Use worker_pb.TaskPolicy instead
type TaskPolicy = worker_pb.TaskPolicy
// TaskPersistence interface for task state persistence
type TaskPersistence interface {
SaveTaskState(task *MaintenanceTask) error
LoadTaskState(taskID string) (*MaintenanceTask, error)
LoadAllTaskStates() ([]*MaintenanceTask, error)
DeleteTaskState(taskID string) error
CleanupCompletedTasks() error
}
// Default configuration values
func DefaultMaintenanceConfig() *MaintenanceConfig {
return DefaultMaintenanceConfigProto()
@@ -273,6 +333,7 @@ type MaintenanceQueue struct {
mutex sync.RWMutex
policy *MaintenancePolicy
integration *MaintenanceIntegration
persistence TaskPersistence // Interface for task persistence
}
// MaintenanceScanner analyzes the cluster and generates maintenance tasks
@@ -301,8 +362,10 @@ type TaskDetectionResult struct {
type VolumeHealthMetrics struct {
VolumeID uint32 `json:"volume_id"`
Server string `json:"server"`
-DiskType string `json:"disk_type"` // Disk type (e.g., "hdd", "ssd") or disk path (e.g., "/data1")
-DiskId uint32 `json:"disk_id"` // ID of the disk in Store.Locations array
+DiskType string `json:"disk_type"` // Disk type (e.g., "hdd", "ssd") or disk path (e.g., "/data1")
+DiskId uint32 `json:"disk_id"` // ID of the disk in Store.Locations array
+DataCenter string `json:"data_center"` // Data center of the server
+Rack string `json:"rack"` // Rack of the server
Collection string `json:"collection"`
Size uint64 `json:"size"`
DeletedBytes uint64 `json:"deleted_bytes"`
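
Finally, TaskCreationMetrics (defined above) can be populated from these enriched health metrics when a detector trips a threshold, giving the admin UI's task detail view the triggering measurement and location. A hedged sketch; the trigger name and helper are illustrative, not code from this commit:

// newCreationMetrics records why a task was created, so the task detail
// view can display the metric, the threshold, and where the volume lives.
func newCreationMetrics(m *VolumeHealthMetrics, garbageRatio, threshold float64) *TaskCreationMetrics {
	return &TaskCreationMetrics{
		TriggerMetric: "garbage_ratio",
		MetricValue:   garbageRatio,
		Threshold:     threshold,
		VolumeMetrics: m,
		AdditionalData: map[string]interface{}{
			"server":      m.Server,
			"data_center": m.DataCenter,
			"rack":        m.Rack,
		},
	}
}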