Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)
* fix float stepping * do not auto refresh * only logs when non 200 status * fix maintenance task sorting and cleanup redundant handler logic * Refactor log retrieval to persist to disk and fix slowness - Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail - Implement background log fetching on task completion in worker_grpc_server.go - Implement async background refresh for in-progress tasks - Completely remove blocking gRPC calls from the UI path to fix 10s timeouts - Cleanup debug logs and performance profiling code * Ensure consistent deterministic sorting in config_persistence cleanup * Replace magic numbers with constants and remove debug logs - Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go - Replaced magic numbers with these constants throughout the codebase - Verified removal of stdout debug printing - Ensured consistent truncation logic during log persistence * Address code review feedback on history truncation and logging logic - Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail - Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs) - Remove unused Timeout field from LogRequestContext and sync select timeouts with constants - Ensure AssignmentHistory is only provided in the top-level field for better JSON structure * Implement goroutine leak protection and request deduplication - Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task - Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map - Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming - Ensure all persistent log-fetching goroutines are bounded and efficiently managed * Fix potential nil pointer panics in maintenance handlers - Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig - Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized - Ensure internal helper methods consistently check for adminServer initialization before use * Strictly enforce disk-only log reading - Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view - Remove unused lastLogFetch tracking fields to clean up dead code - Ensure logs are only updated upon task completion via handleTaskCompletion * Refactor GetWorkerLogs to read from disk - Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs - Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors - Ensure consistent log retrieval behavior across the application (disk-only) * Fix timestamp parsing in log viewer - Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps - Fix "Invalid time value" error when displaying logs fetched from disk - Regenerate templates * master: fallback to HDD if SSD volumes are full in Assign * worker: improve EC detection logging and fix skip counters * worker: add Sync method to TaskLogger interface * worker: implement Sync and ensure logs are flushed before task completion * admin: improve task log retrieval with retries and better timeouts * admin: robust timestamp parsing in task detail view
This commit is contained in:
@@ -5,7 +5,6 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -33,6 +32,17 @@ import (
|
||||
_ "github.com/seaweedfs/seaweedfs/weed/credential/grpc" // Register gRPC credential store
|
||||
)
|
||||
|
||||
const (
|
||||
maxAssignmentHistoryDisplay = 50
|
||||
maxLogMessageLength = 2000
|
||||
maxLogFields = 20
|
||||
maxRelatedTasksDisplay = 50
|
||||
maxRecentTasksDisplay = 10
|
||||
defaultCacheTimeout = 10 * time.Second
|
||||
defaultFilerCacheTimeout = 30 * time.Second
|
||||
defaultStatsCacheTimeout = 30 * time.Second
|
||||
)
|
||||
|
||||
// FilerConfig holds filer configuration needed for bucket operations
|
||||
type FilerConfig struct {
|
||||
BucketsPath string
|
||||
@@ -132,10 +142,10 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
|
||||
templateFS: templateFS,
|
||||
dataDir: dataDir,
|
||||
grpcDialOption: grpcDialOption,
|
||||
cacheExpiration: 10 * time.Second,
|
||||
filerCacheExpiration: 30 * time.Second, // Cache filers for 30 seconds
|
||||
cacheExpiration: defaultCacheTimeout,
|
||||
filerCacheExpiration: defaultFilerCacheTimeout,
|
||||
configPersistence: NewConfigPersistence(dataDir),
|
||||
collectionStatsCacheThreshold: 30 * time.Second,
|
||||
collectionStatsCacheThreshold: defaultStatsCacheTimeout,
|
||||
s3TablesManager: newS3TablesManager(),
|
||||
icebergPort: icebergPort,
|
||||
}
|
||||
@@ -779,7 +789,7 @@ func (s *AdminServer) GetClusterBrokers() (*ClusterBrokersData, error) {
|
||||
|
||||
// ShowMaintenanceQueue displays the maintenance queue page
|
||||
func (as *AdminServer) ShowMaintenanceQueue(c *gin.Context) {
|
||||
data, err := as.getMaintenanceQueueData()
|
||||
data, err := as.GetMaintenanceQueueData()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
@@ -868,7 +878,7 @@ func (as *AdminServer) TriggerMaintenanceScan(c *gin.Context) {
|
||||
|
||||
// GetMaintenanceTasks returns all maintenance tasks
|
||||
func (as *AdminServer) GetMaintenanceTasks(c *gin.Context) {
|
||||
tasks, err := as.getMaintenanceTasks()
|
||||
tasks, err := as.GetAllMaintenanceTasks()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
@@ -1032,9 +1042,9 @@ func (as *AdminServer) UpdateMaintenanceConfigData(config *maintenance.Maintenan
|
||||
|
||||
// Helper methods for maintenance operations
|
||||
|
||||
// getMaintenanceQueueData returns data for the maintenance queue UI
|
||||
func (as *AdminServer) getMaintenanceQueueData() (*maintenance.MaintenanceQueueData, error) {
|
||||
tasks, err := as.getMaintenanceTasks()
|
||||
// GetMaintenanceQueueData returns data for the maintenance queue UI
|
||||
func (as *AdminServer) GetMaintenanceQueueData() (*maintenance.MaintenanceQueueData, error) {
|
||||
tasks, err := as.GetAllMaintenanceTasks()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -1089,14 +1099,16 @@ func (as *AdminServer) getMaintenanceQueueStats() (*maintenance.QueueStats, erro
|
||||
return queueStats, nil
|
||||
}
|
||||
|
||||
// getMaintenanceTasks returns all maintenance tasks
|
||||
func (as *AdminServer) getMaintenanceTasks() ([]*maintenance.MaintenanceTask, error) {
|
||||
// GetAllMaintenanceTasks returns all maintenance tasks
|
||||
func (as *AdminServer) GetAllMaintenanceTasks() ([]*maintenance.MaintenanceTask, error) {
|
||||
if as.maintenanceManager == nil {
|
||||
return []*maintenance.MaintenanceTask{}, nil
|
||||
}
|
||||
|
||||
// Collect all tasks from memory across all statuses
|
||||
allTasks := []*maintenance.MaintenanceTask{}
|
||||
// 1. Collect all tasks from memory
|
||||
tasksMap := make(map[string]*maintenance.MaintenanceTask)
|
||||
|
||||
// Collect from memory via GetTasks loop to ensure we catch everything
|
||||
statuses := []maintenance.MaintenanceTaskStatus{
|
||||
maintenance.TaskStatusPending,
|
||||
maintenance.TaskStatusAssigned,
|
||||
@@ -1108,29 +1120,92 @@ func (as *AdminServer) getMaintenanceTasks() ([]*maintenance.MaintenanceTask, er
|
||||
|
||||
for _, status := range statuses {
|
||||
tasks := as.maintenanceManager.GetTasks(status, "", 0)
|
||||
allTasks = append(allTasks, tasks...)
|
||||
for _, t := range tasks {
|
||||
tasksMap[t.ID] = t
|
||||
}
|
||||
}
|
||||
|
||||
// Also load any persisted tasks that might not be in memory
|
||||
// 2. Merge persisted tasks
|
||||
if as.configPersistence != nil {
|
||||
persistedTasks, err := as.configPersistence.LoadAllTaskStates()
|
||||
if err == nil {
|
||||
// Add any persisted tasks not already in memory
|
||||
for _, persistedTask := range persistedTasks {
|
||||
found := false
|
||||
for _, memoryTask := range allTasks {
|
||||
if memoryTask.ID == persistedTask.ID {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
allTasks = append(allTasks, persistedTask)
|
||||
for _, t := range persistedTasks {
|
||||
if _, exists := tasksMap[t.ID]; !exists {
|
||||
tasksMap[t.ID] = t
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Bucketize buckets
|
||||
var pendingTasks, activeTasks, finishedTasks []*maintenance.MaintenanceTask
|
||||
|
||||
for _, t := range tasksMap {
|
||||
switch t.Status {
|
||||
case maintenance.TaskStatusPending:
|
||||
pendingTasks = append(pendingTasks, t)
|
||||
case maintenance.TaskStatusAssigned, maintenance.TaskStatusInProgress:
|
||||
activeTasks = append(activeTasks, t)
|
||||
case maintenance.TaskStatusCompleted, maintenance.TaskStatusFailed, maintenance.TaskStatusCancelled:
|
||||
finishedTasks = append(finishedTasks, t)
|
||||
default:
|
||||
// Treat unknown as finished/archived? Or pending?
|
||||
// Safest to add to finished so they appear somewhere
|
||||
finishedTasks = append(finishedTasks, t)
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Sort buckets
|
||||
// Pending: Newest Created First
|
||||
sort.Slice(pendingTasks, func(i, j int) bool {
|
||||
return pendingTasks[i].CreatedAt.After(pendingTasks[j].CreatedAt)
|
||||
})
|
||||
|
||||
// Active: Newest Created First (or StartedAt?)
|
||||
sort.Slice(activeTasks, func(i, j int) bool {
|
||||
return activeTasks[i].CreatedAt.After(activeTasks[j].CreatedAt)
|
||||
})
|
||||
|
||||
// Finished: Newest Completed First
|
||||
sort.Slice(finishedTasks, func(i, j int) bool {
|
||||
t1 := finishedTasks[i].CompletedAt
|
||||
t2 := finishedTasks[j].CompletedAt
|
||||
|
||||
// Handle nil completion times
|
||||
if t1 == nil && t2 == nil {
|
||||
// Both nil, fallback to CreatedAt
|
||||
if !finishedTasks[i].CreatedAt.Equal(finishedTasks[j].CreatedAt) {
|
||||
return finishedTasks[i].CreatedAt.After(finishedTasks[j].CreatedAt)
|
||||
}
|
||||
return finishedTasks[i].ID > finishedTasks[j].ID
|
||||
}
|
||||
if t1 == nil {
|
||||
return false // t1 (nil) goes to bottom
|
||||
}
|
||||
if t2 == nil {
|
||||
return true // t2 (nil) goes to bottom
|
||||
}
|
||||
|
||||
// Compare completion times
|
||||
if !t1.Equal(*t2) {
|
||||
return t1.After(*t2)
|
||||
}
|
||||
|
||||
// Fallback to CreatedAt if completion times are identical
|
||||
if !finishedTasks[i].CreatedAt.Equal(finishedTasks[j].CreatedAt) {
|
||||
return finishedTasks[i].CreatedAt.After(finishedTasks[j].CreatedAt)
|
||||
}
|
||||
|
||||
// Final tie-breaker: ID
|
||||
return finishedTasks[i].ID > finishedTasks[j].ID
|
||||
})
|
||||
|
||||
// 5. Recombine
|
||||
allTasks := make([]*maintenance.MaintenanceTask, 0, len(tasksMap))
|
||||
allTasks = append(allTasks, pendingTasks...)
|
||||
allTasks = append(allTasks, activeTasks...)
|
||||
allTasks = append(allTasks, finishedTasks...)
|
||||
|
||||
return allTasks, nil
|
||||
}
|
||||
|
||||
@@ -1181,15 +1256,25 @@ func (as *AdminServer) GetMaintenanceTaskDetail(taskID string) (*maintenance.Tas
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Copy task and truncate assignment history for display
|
||||
displayTask := *task
|
||||
displayTask.AssignmentHistory = nil // History is provided separately in taskDetail
|
||||
|
||||
// Create task detail structure from the loaded task
|
||||
taskDetail := &maintenance.TaskDetailData{
|
||||
Task: task,
|
||||
Task: &displayTask,
|
||||
AssignmentHistory: task.AssignmentHistory, // Use assignment history from persisted task
|
||||
ExecutionLogs: []*maintenance.TaskExecutionLog{},
|
||||
RelatedTasks: []*maintenance.MaintenanceTask{},
|
||||
LastUpdated: time.Now(),
|
||||
}
|
||||
|
||||
// Truncate assignment history if it's too long (display last N only)
|
||||
if len(taskDetail.AssignmentHistory) > maxAssignmentHistoryDisplay {
|
||||
startIdx := len(taskDetail.AssignmentHistory) - maxAssignmentHistoryDisplay
|
||||
taskDetail.AssignmentHistory = taskDetail.AssignmentHistory[startIdx:]
|
||||
}
|
||||
|
||||
if taskDetail.AssignmentHistory == nil {
|
||||
taskDetail.AssignmentHistory = []*maintenance.TaskAssignmentRecord{}
|
||||
}
|
||||
@@ -1205,72 +1290,19 @@ func (as *AdminServer) GetMaintenanceTaskDetail(taskID string) (*maintenance.Tas
|
||||
}
|
||||
}
|
||||
|
||||
// Get execution logs from worker if task is active/completed and worker is connected
|
||||
if task.Status == maintenance.TaskStatusInProgress || task.Status == maintenance.TaskStatusCompleted {
|
||||
if as.workerGrpcServer != nil && task.WorkerID != "" {
|
||||
workerLogs, err := as.workerGrpcServer.RequestTaskLogs(task.WorkerID, taskID, 100, "")
|
||||
if err == nil && len(workerLogs) > 0 {
|
||||
// Convert worker logs to maintenance logs
|
||||
for _, workerLog := range workerLogs {
|
||||
maintenanceLog := &maintenance.TaskExecutionLog{
|
||||
Timestamp: time.Unix(workerLog.Timestamp, 0),
|
||||
Level: workerLog.Level,
|
||||
Message: workerLog.Message,
|
||||
Source: "worker",
|
||||
TaskID: taskID,
|
||||
WorkerID: task.WorkerID,
|
||||
}
|
||||
// carry structured fields if present
|
||||
if len(workerLog.Fields) > 0 {
|
||||
maintenanceLog.Fields = make(map[string]string, len(workerLog.Fields))
|
||||
for k, v := range workerLog.Fields {
|
||||
maintenanceLog.Fields[k] = v
|
||||
}
|
||||
}
|
||||
// carry optional progress/status
|
||||
if workerLog.Progress != 0 {
|
||||
p := float64(workerLog.Progress)
|
||||
maintenanceLog.Progress = &p
|
||||
}
|
||||
if workerLog.Status != "" {
|
||||
maintenanceLog.Status = workerLog.Status
|
||||
}
|
||||
taskDetail.ExecutionLogs = append(taskDetail.ExecutionLogs, maintenanceLog)
|
||||
}
|
||||
} else if err != nil {
|
||||
// Add a diagnostic log entry when worker logs cannot be retrieved
|
||||
diagnosticLog := &maintenance.TaskExecutionLog{
|
||||
Timestamp: time.Now(),
|
||||
Level: "WARNING",
|
||||
Message: fmt.Sprintf("Failed to retrieve worker logs: %v", err),
|
||||
Source: "admin",
|
||||
TaskID: taskID,
|
||||
WorkerID: task.WorkerID,
|
||||
}
|
||||
taskDetail.ExecutionLogs = append(taskDetail.ExecutionLogs, diagnosticLog)
|
||||
glog.V(1).Infof("Failed to get worker logs for task %s from worker %s: %v", taskID, task.WorkerID, err)
|
||||
}
|
||||
// Load execution logs from disk
|
||||
if as.configPersistence != nil {
|
||||
logs, err := as.configPersistence.LoadTaskExecutionLogs(taskID)
|
||||
if err == nil {
|
||||
taskDetail.ExecutionLogs = logs
|
||||
} else {
|
||||
// Add diagnostic information when worker is not available
|
||||
reason := "worker gRPC server not available"
|
||||
if task.WorkerID == "" {
|
||||
reason = "no worker assigned to task"
|
||||
}
|
||||
diagnosticLog := &maintenance.TaskExecutionLog{
|
||||
Timestamp: time.Now(),
|
||||
Level: "INFO",
|
||||
Message: fmt.Sprintf("Worker logs not available: %s", reason),
|
||||
Source: "admin",
|
||||
TaskID: taskID,
|
||||
WorkerID: task.WorkerID,
|
||||
}
|
||||
taskDetail.ExecutionLogs = append(taskDetail.ExecutionLogs, diagnosticLog)
|
||||
glog.V(2).Infof("No execution logs found on disk for task %s", taskID)
|
||||
}
|
||||
}
|
||||
|
||||
// Get related tasks (other tasks on same volume/server)
|
||||
if task.VolumeID != 0 || task.Server != "" {
|
||||
allTasks := as.maintenanceManager.GetTasks("", "", 50) // Get recent tasks
|
||||
allTasks := as.maintenanceManager.GetTasks("", "", maxRelatedTasksDisplay) // Get recent tasks
|
||||
for _, relatedTask := range allTasks {
|
||||
if relatedTask.ID != taskID &&
|
||||
(relatedTask.VolumeID == task.VolumeID || relatedTask.Server == task.Server) {
|
||||
@@ -1324,7 +1356,7 @@ func (as *AdminServer) getMaintenanceWorkerDetails(workerID string) (*WorkerDeta
|
||||
}
|
||||
|
||||
// Get recent tasks for this worker
|
||||
recentTasks := as.maintenanceManager.GetTasks(TaskStatusCompleted, "", 10)
|
||||
recentTasks := as.maintenanceManager.GetTasks(TaskStatusCompleted, "", maxRecentTasksDisplay)
|
||||
var workerRecentTasks []*MaintenanceTask
|
||||
for _, task := range recentTasks {
|
||||
if task.WorkerID == workerID {
|
||||
@@ -1336,12 +1368,13 @@ func (as *AdminServer) getMaintenanceWorkerDetails(workerID string) (*WorkerDeta
|
||||
var totalDuration time.Duration
|
||||
var completedTasks, failedTasks int
|
||||
for _, task := range workerRecentTasks {
|
||||
if task.Status == TaskStatusCompleted {
|
||||
switch task.Status {
|
||||
case TaskStatusCompleted:
|
||||
completedTasks++
|
||||
if task.StartedAt != nil && task.CompletedAt != nil {
|
||||
totalDuration += task.CompletedAt.Sub(*task.StartedAt)
|
||||
}
|
||||
} else if task.Status == TaskStatusFailed {
|
||||
case TaskStatusFailed:
|
||||
failedTasks++
|
||||
}
|
||||
}
|
||||
@@ -1370,31 +1403,29 @@ func (as *AdminServer) getMaintenanceWorkerDetails(workerID string) (*WorkerDeta
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetWorkerLogs fetches logs from a specific worker for a task
|
||||
// GetWorkerLogs fetches logs from a specific worker for a task (now reads from disk)
|
||||
func (as *AdminServer) GetWorkerLogs(c *gin.Context) {
|
||||
workerID := c.Param("id")
|
||||
taskID := c.Query("taskId")
|
||||
maxEntriesStr := c.DefaultQuery("maxEntries", "100")
|
||||
logLevel := c.DefaultQuery("logLevel", "")
|
||||
|
||||
maxEntries := int32(100)
|
||||
if maxEntriesStr != "" {
|
||||
if parsed, err := strconv.ParseInt(maxEntriesStr, 10, 32); err == nil {
|
||||
maxEntries = int32(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
if as.workerGrpcServer == nil {
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "Worker gRPC server not available"})
|
||||
// Check config persistence first
|
||||
if as.configPersistence == nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "Config persistence not available"})
|
||||
return
|
||||
}
|
||||
|
||||
logs, err := as.workerGrpcServer.RequestTaskLogs(workerID, taskID, maxEntries, logLevel)
|
||||
// Load logs strictly from disk to avoid timeouts and network dependency
|
||||
// This matches the behavior of the Task Detail page
|
||||
logs, err := as.configPersistence.LoadTaskExecutionLogs(taskID)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": fmt.Sprintf("Failed to get logs from worker: %v", err)})
|
||||
return
|
||||
glog.V(2).Infof("No execution logs found on disk for task %s: %v", taskID, err)
|
||||
logs = []*maintenance.TaskExecutionLog{}
|
||||
}
|
||||
|
||||
// Filter logs by workerID if strictly needed, but usually task logs are what we want
|
||||
// The persistent logs struct (TaskExecutionLog) matches what the frontend expects for the detail view
|
||||
// ensuring consistent display.
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"worker_id": workerID, "task_id": taskID, "logs": logs, "count": len(logs)})
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user