Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)

* fix float stepping

* do not auto refresh

* only log when the response status is non-200

* fix maintenance task sorting and cleanup redundant handler logic

* Refactor log retrieval to persist to disk and fix slowness

- Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail
- Implement background log fetching on task completion in worker_grpc_server.go
- Implement async background refresh for in-progress tasks
- Completely remove blocking gRPC calls from the UI path to fix 10s timeouts
- Cleanup debug logs and performance profiling code

* Ensure consistent deterministic sorting in config_persistence cleanup

* Replace magic numbers with constants and remove debug logs

- Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go
- Replaced magic numbers with these constants throughout the codebase
- Verified removal of stdout debug printing
- Ensured consistent truncation logic during log persistence

* Address code review feedback on history truncation and logging logic

- Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail
- Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs)
- Remove unused Timeout field from LogRequestContext and sync select timeouts with constants
- Ensure AssignmentHistory is only provided in the top-level field for better JSON structure

* Implement goroutine leak protection and request deduplication

- Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task
- Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map
- Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming
- Ensure all persistent log-fetching goroutines are bounded and efficiently managed

* Fix potential nil pointer panics in maintenance handlers

- Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig
- Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized
- Ensure internal helper methods consistently check for adminServer initialization before use

* Strictly enforce disk-only log reading

- Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view
- Remove unused lastLogFetch tracking fields to clean up dead code
- Ensure logs are only updated upon task completion via handleTaskCompletion

* Refactor GetWorkerLogs to read from disk

- Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs
- Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors
- Ensure consistent log retrieval behavior across the application (disk-only)

* Fix timestamp parsing in log viewer

- Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps
- Fix "Invalid time value" error when displaying logs fetched from disk
- Regenerate templates

* master: fallback to HDD if SSD volumes are full in Assign

* worker: improve EC detection logging and fix skip counters

* worker: add Sync method to TaskLogger interface

* worker: implement Sync and ensure logs are flushed before task completion

* admin: improve task log retrieval with retries and better timeouts

* admin: robust timestamp parsing in task detail view
This commit is contained in:
Chris Lu
2026-02-04 08:48:55 -08:00
committed by GitHub
parent 2ff1cd9fc9
commit 72a8f598f2
51 changed files with 499 additions and 241 deletions

View File

@@ -5,7 +5,6 @@ import (
"fmt"
"net/http"
"sort"
"strconv"
"strings"
"time"
@@ -33,6 +32,17 @@ import (
_ "github.com/seaweedfs/seaweedfs/weed/credential/grpc" // Register gRPC credential store
)
const (
maxAssignmentHistoryDisplay = 50
maxLogMessageLength = 2000
maxLogFields = 20
maxRelatedTasksDisplay = 50
maxRecentTasksDisplay = 10
defaultCacheTimeout = 10 * time.Second
defaultFilerCacheTimeout = 30 * time.Second
defaultStatsCacheTimeout = 30 * time.Second
)
// FilerConfig holds filer configuration needed for bucket operations
type FilerConfig struct {
BucketsPath string
@@ -132,10 +142,10 @@ func NewAdminServer(masters string, templateFS http.FileSystem, dataDir string,
templateFS: templateFS,
dataDir: dataDir,
grpcDialOption: grpcDialOption,
cacheExpiration: 10 * time.Second,
filerCacheExpiration: 30 * time.Second, // Cache filers for 30 seconds
cacheExpiration: defaultCacheTimeout,
filerCacheExpiration: defaultFilerCacheTimeout,
configPersistence: NewConfigPersistence(dataDir),
collectionStatsCacheThreshold: 30 * time.Second,
collectionStatsCacheThreshold: defaultStatsCacheTimeout,
s3TablesManager: newS3TablesManager(),
icebergPort: icebergPort,
}
@@ -779,7 +789,7 @@ func (s *AdminServer) GetClusterBrokers() (*ClusterBrokersData, error) {
// ShowMaintenanceQueue displays the maintenance queue page
func (as *AdminServer) ShowMaintenanceQueue(c *gin.Context) {
data, err := as.getMaintenanceQueueData()
data, err := as.GetMaintenanceQueueData()
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
@@ -868,7 +878,7 @@ func (as *AdminServer) TriggerMaintenanceScan(c *gin.Context) {
// GetMaintenanceTasks returns all maintenance tasks
func (as *AdminServer) GetMaintenanceTasks(c *gin.Context) {
tasks, err := as.getMaintenanceTasks()
tasks, err := as.GetAllMaintenanceTasks()
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
@@ -1032,9 +1042,9 @@ func (as *AdminServer) UpdateMaintenanceConfigData(config *maintenance.Maintenan
// Helper methods for maintenance operations
// getMaintenanceQueueData returns data for the maintenance queue UI
func (as *AdminServer) getMaintenanceQueueData() (*maintenance.MaintenanceQueueData, error) {
tasks, err := as.getMaintenanceTasks()
// GetMaintenanceQueueData returns data for the maintenance queue UI
func (as *AdminServer) GetMaintenanceQueueData() (*maintenance.MaintenanceQueueData, error) {
tasks, err := as.GetAllMaintenanceTasks()
if err != nil {
return nil, err
}
@@ -1089,14 +1099,16 @@ func (as *AdminServer) getMaintenanceQueueStats() (*maintenance.QueueStats, erro
return queueStats, nil
}
// getMaintenanceTasks returns all maintenance tasks
func (as *AdminServer) getMaintenanceTasks() ([]*maintenance.MaintenanceTask, error) {
// GetAllMaintenanceTasks returns all maintenance tasks
func (as *AdminServer) GetAllMaintenanceTasks() ([]*maintenance.MaintenanceTask, error) {
if as.maintenanceManager == nil {
return []*maintenance.MaintenanceTask{}, nil
}
// Collect all tasks from memory across all statuses
allTasks := []*maintenance.MaintenanceTask{}
// 1. Collect all tasks from memory
tasksMap := make(map[string]*maintenance.MaintenanceTask)
// Collect from memory via GetTasks loop to ensure we catch everything
statuses := []maintenance.MaintenanceTaskStatus{
maintenance.TaskStatusPending,
maintenance.TaskStatusAssigned,
@@ -1108,29 +1120,92 @@ func (as *AdminServer) getMaintenanceTasks() ([]*maintenance.MaintenanceTask, er
for _, status := range statuses {
tasks := as.maintenanceManager.GetTasks(status, "", 0)
allTasks = append(allTasks, tasks...)
for _, t := range tasks {
tasksMap[t.ID] = t
}
}
// Also load any persisted tasks that might not be in memory
// 2. Merge persisted tasks
if as.configPersistence != nil {
persistedTasks, err := as.configPersistence.LoadAllTaskStates()
if err == nil {
// Add any persisted tasks not already in memory
for _, persistedTask := range persistedTasks {
found := false
for _, memoryTask := range allTasks {
if memoryTask.ID == persistedTask.ID {
found = true
break
}
}
if !found {
allTasks = append(allTasks, persistedTask)
for _, t := range persistedTasks {
if _, exists := tasksMap[t.ID]; !exists {
tasksMap[t.ID] = t
}
}
}
}
// 3. Bucketize buckets
var pendingTasks, activeTasks, finishedTasks []*maintenance.MaintenanceTask
for _, t := range tasksMap {
switch t.Status {
case maintenance.TaskStatusPending:
pendingTasks = append(pendingTasks, t)
case maintenance.TaskStatusAssigned, maintenance.TaskStatusInProgress:
activeTasks = append(activeTasks, t)
case maintenance.TaskStatusCompleted, maintenance.TaskStatusFailed, maintenance.TaskStatusCancelled:
finishedTasks = append(finishedTasks, t)
default:
// Treat unknown as finished/archived? Or pending?
// Safest to add to finished so they appear somewhere
finishedTasks = append(finishedTasks, t)
}
}
// 4. Sort buckets
// Pending: Newest Created First
sort.Slice(pendingTasks, func(i, j int) bool {
return pendingTasks[i].CreatedAt.After(pendingTasks[j].CreatedAt)
})
// Active: Newest Created First (or StartedAt?)
sort.Slice(activeTasks, func(i, j int) bool {
return activeTasks[i].CreatedAt.After(activeTasks[j].CreatedAt)
})
// Finished: Newest Completed First
sort.Slice(finishedTasks, func(i, j int) bool {
t1 := finishedTasks[i].CompletedAt
t2 := finishedTasks[j].CompletedAt
// Handle nil completion times
if t1 == nil && t2 == nil {
// Both nil, fallback to CreatedAt
if !finishedTasks[i].CreatedAt.Equal(finishedTasks[j].CreatedAt) {
return finishedTasks[i].CreatedAt.After(finishedTasks[j].CreatedAt)
}
return finishedTasks[i].ID > finishedTasks[j].ID
}
if t1 == nil {
return false // t1 (nil) goes to bottom
}
if t2 == nil {
return true // t2 (nil) goes to bottom
}
// Compare completion times
if !t1.Equal(*t2) {
return t1.After(*t2)
}
// Fallback to CreatedAt if completion times are identical
if !finishedTasks[i].CreatedAt.Equal(finishedTasks[j].CreatedAt) {
return finishedTasks[i].CreatedAt.After(finishedTasks[j].CreatedAt)
}
// Final tie-breaker: ID
return finishedTasks[i].ID > finishedTasks[j].ID
})
// 5. Recombine
allTasks := make([]*maintenance.MaintenanceTask, 0, len(tasksMap))
allTasks = append(allTasks, pendingTasks...)
allTasks = append(allTasks, activeTasks...)
allTasks = append(allTasks, finishedTasks...)
return allTasks, nil
}
@@ -1181,15 +1256,25 @@ func (as *AdminServer) GetMaintenanceTaskDetail(taskID string) (*maintenance.Tas
return nil, err
}
// Copy task and truncate assignment history for display
displayTask := *task
displayTask.AssignmentHistory = nil // History is provided separately in taskDetail
// Create task detail structure from the loaded task
taskDetail := &maintenance.TaskDetailData{
Task: task,
Task: &displayTask,
AssignmentHistory: task.AssignmentHistory, // Use assignment history from persisted task
ExecutionLogs: []*maintenance.TaskExecutionLog{},
RelatedTasks: []*maintenance.MaintenanceTask{},
LastUpdated: time.Now(),
}
// Truncate assignment history if it's too long (display last N only)
if len(taskDetail.AssignmentHistory) > maxAssignmentHistoryDisplay {
startIdx := len(taskDetail.AssignmentHistory) - maxAssignmentHistoryDisplay
taskDetail.AssignmentHistory = taskDetail.AssignmentHistory[startIdx:]
}
if taskDetail.AssignmentHistory == nil {
taskDetail.AssignmentHistory = []*maintenance.TaskAssignmentRecord{}
}
@@ -1205,72 +1290,19 @@ func (as *AdminServer) GetMaintenanceTaskDetail(taskID string) (*maintenance.Tas
}
}
// Get execution logs from worker if task is active/completed and worker is connected
if task.Status == maintenance.TaskStatusInProgress || task.Status == maintenance.TaskStatusCompleted {
if as.workerGrpcServer != nil && task.WorkerID != "" {
workerLogs, err := as.workerGrpcServer.RequestTaskLogs(task.WorkerID, taskID, 100, "")
if err == nil && len(workerLogs) > 0 {
// Convert worker logs to maintenance logs
for _, workerLog := range workerLogs {
maintenanceLog := &maintenance.TaskExecutionLog{
Timestamp: time.Unix(workerLog.Timestamp, 0),
Level: workerLog.Level,
Message: workerLog.Message,
Source: "worker",
TaskID: taskID,
WorkerID: task.WorkerID,
}
// carry structured fields if present
if len(workerLog.Fields) > 0 {
maintenanceLog.Fields = make(map[string]string, len(workerLog.Fields))
for k, v := range workerLog.Fields {
maintenanceLog.Fields[k] = v
}
}
// carry optional progress/status
if workerLog.Progress != 0 {
p := float64(workerLog.Progress)
maintenanceLog.Progress = &p
}
if workerLog.Status != "" {
maintenanceLog.Status = workerLog.Status
}
taskDetail.ExecutionLogs = append(taskDetail.ExecutionLogs, maintenanceLog)
}
} else if err != nil {
// Add a diagnostic log entry when worker logs cannot be retrieved
diagnosticLog := &maintenance.TaskExecutionLog{
Timestamp: time.Now(),
Level: "WARNING",
Message: fmt.Sprintf("Failed to retrieve worker logs: %v", err),
Source: "admin",
TaskID: taskID,
WorkerID: task.WorkerID,
}
taskDetail.ExecutionLogs = append(taskDetail.ExecutionLogs, diagnosticLog)
glog.V(1).Infof("Failed to get worker logs for task %s from worker %s: %v", taskID, task.WorkerID, err)
}
// Load execution logs from disk
if as.configPersistence != nil {
logs, err := as.configPersistence.LoadTaskExecutionLogs(taskID)
if err == nil {
taskDetail.ExecutionLogs = logs
} else {
// Add diagnostic information when worker is not available
reason := "worker gRPC server not available"
if task.WorkerID == "" {
reason = "no worker assigned to task"
}
diagnosticLog := &maintenance.TaskExecutionLog{
Timestamp: time.Now(),
Level: "INFO",
Message: fmt.Sprintf("Worker logs not available: %s", reason),
Source: "admin",
TaskID: taskID,
WorkerID: task.WorkerID,
}
taskDetail.ExecutionLogs = append(taskDetail.ExecutionLogs, diagnosticLog)
glog.V(2).Infof("No execution logs found on disk for task %s", taskID)
}
}
// Get related tasks (other tasks on same volume/server)
if task.VolumeID != 0 || task.Server != "" {
allTasks := as.maintenanceManager.GetTasks("", "", 50) // Get recent tasks
allTasks := as.maintenanceManager.GetTasks("", "", maxRelatedTasksDisplay) // Get recent tasks
for _, relatedTask := range allTasks {
if relatedTask.ID != taskID &&
(relatedTask.VolumeID == task.VolumeID || relatedTask.Server == task.Server) {
@@ -1324,7 +1356,7 @@ func (as *AdminServer) getMaintenanceWorkerDetails(workerID string) (*WorkerDeta
}
// Get recent tasks for this worker
recentTasks := as.maintenanceManager.GetTasks(TaskStatusCompleted, "", 10)
recentTasks := as.maintenanceManager.GetTasks(TaskStatusCompleted, "", maxRecentTasksDisplay)
var workerRecentTasks []*MaintenanceTask
for _, task := range recentTasks {
if task.WorkerID == workerID {
@@ -1336,12 +1368,13 @@ func (as *AdminServer) getMaintenanceWorkerDetails(workerID string) (*WorkerDeta
var totalDuration time.Duration
var completedTasks, failedTasks int
for _, task := range workerRecentTasks {
if task.Status == TaskStatusCompleted {
switch task.Status {
case TaskStatusCompleted:
completedTasks++
if task.StartedAt != nil && task.CompletedAt != nil {
totalDuration += task.CompletedAt.Sub(*task.StartedAt)
}
} else if task.Status == TaskStatusFailed {
case TaskStatusFailed:
failedTasks++
}
}
@@ -1370,31 +1403,29 @@ func (as *AdminServer) getMaintenanceWorkerDetails(workerID string) (*WorkerDeta
}, nil
}
// GetWorkerLogs fetches logs from a specific worker for a task
// GetWorkerLogs fetches logs from a specific worker for a task (now reads from disk)
func (as *AdminServer) GetWorkerLogs(c *gin.Context) {
workerID := c.Param("id")
taskID := c.Query("taskId")
maxEntriesStr := c.DefaultQuery("maxEntries", "100")
logLevel := c.DefaultQuery("logLevel", "")
maxEntries := int32(100)
if maxEntriesStr != "" {
if parsed, err := strconv.ParseInt(maxEntriesStr, 10, 32); err == nil {
maxEntries = int32(parsed)
}
}
if as.workerGrpcServer == nil {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "Worker gRPC server not available"})
// Check config persistence first
if as.configPersistence == nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Config persistence not available"})
return
}
logs, err := as.workerGrpcServer.RequestTaskLogs(workerID, taskID, maxEntries, logLevel)
// Load logs strictly from disk to avoid timeouts and network dependency
// This matches the behavior of the Task Detail page
logs, err := as.configPersistence.LoadTaskExecutionLogs(taskID)
if err != nil {
c.JSON(http.StatusBadGateway, gin.H{"error": fmt.Sprintf("Failed to get logs from worker: %v", err)})
return
glog.V(2).Infof("No execution logs found on disk for task %s: %v", taskID, err)
logs = []*maintenance.TaskExecutionLog{}
}
// Filter logs by workerID if strictly needed, but usually task logs are what we want
// The persistent logs struct (TaskExecutionLog) matches what the frontend expects for the detail view
// ensuring consistent display.
c.JSON(http.StatusOK, gin.H{"worker_id": workerID, "task_id": taskID, "logs": logs, "count": len(logs)})
}