Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)

* fix float stepping

* do not auto refresh

* only log when the response status is non-200

* fix maintenance task sorting and cleanup redundant handler logic

* Refactor log retrieval to persist to disk and fix slowness

- Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail
- Implement background log fetching on task completion in worker_grpc_server.go
- Implement async background refresh for in-progress tasks
- Completely remove blocking gRPC calls from the UI path to fix 10s timeouts
- Cleanup debug logs and performance profiling code

* Ensure consistent deterministic sorting in config_persistence cleanup

* Replace magic numbers with constants and remove debug logs

- Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go
- Replaced magic numbers with these constants throughout the codebase
- Verified removal of stdout debug printing
- Ensured consistent truncation logic during log persistence

* Address code review feedback on history truncation and logging logic

- Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail
- Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs)
- Remove unused Timeout field from LogRequestContext and sync select timeouts with constants
- Ensure AssignmentHistory is only provided in the top-level field for better JSON structure

* Implement goroutine leak protection and request deduplication

- Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task
- Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map
- Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming
- Ensure all persistent log-fetching goroutines are bounded and efficiently managed

* Fix potential nil pointer panics in maintenance handlers

- Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig
- Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized
- Ensure internal helper methods consistently check for adminServer initialization before use

* Strictly enforce disk-only log reading

- Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view
- Remove unused lastLogFetch tracking fields to clean up dead code
- Ensure logs are only updated upon task completion via handleTaskCompletion

* Refactor GetWorkerLogs to read from disk

- Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs
- Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors
- Ensure consistent log retrieval behavior across the application (disk-only)

* Fix timestamp parsing in log viewer

- Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps
- Fix "Invalid time value" error when displaying logs fetched from disk
- Regenerate templates

* master: fallback to HDD if SSD volumes are full in Assign

* worker: improve EC detection logging and fix skip counters

* worker: add Sync method to TaskLogger interface

* worker: implement Sync and ensure logs are flushed before task completion

* admin: improve task log retrieval with retries and better timeouts

* admin: robust timestamp parsing in task detail view
This commit is contained in:
Chris Lu
2026-02-04 08:48:55 -08:00
committed by GitHub
parent 2ff1cd9fc9
commit 72a8f598f2
51 changed files with 499 additions and 241 deletions

View File

@@ -8,6 +8,7 @@ import (
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/admin/maintenance"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
@@ -17,6 +18,15 @@ import (
"google.golang.org/grpc/peer"
)
const (
	// maxLogFetchLimit caps how many log entries are requested from a worker per fetch.
	maxLogFetchLimit = 1000
	// maxLogMessageSize is the maximum length (in bytes) of a single persisted log
	// message; longer messages are truncated before being written to disk.
	maxLogMessageSize = 2000
	// maxLogFieldsCount caps how many structured fields are kept per log entry.
	maxLogFieldsCount = 20
	// logRequestTimeout presumably bounds an individual log request; its use site
	// is outside this view — TODO confirm.
	logRequestTimeout = 10 * time.Second
	// logResponseTimeout is how long RequestTaskLogs waits for a worker's log response.
	logResponseTimeout = 30 * time.Second
	// logSendTimeout is how long RequestTaskLogs waits to enqueue a request on the
	// worker's outgoing channel.
	logSendTimeout = 10 * time.Second
)
// WorkerGrpcServer implements the WorkerService gRPC interface
type WorkerGrpcServer struct {
worker_pb.UnimplementedWorkerServiceServer
@@ -42,7 +52,6 @@ type LogRequestContext struct {
TaskID string
WorkerID string
ResponseCh chan *worker_pb.TaskLogResponse
Timeout time.Time
}
// WorkerConnection represents an active worker connection
@@ -89,8 +98,9 @@ func (s *WorkerGrpcServer) StartWithTLS(port int) error {
s.listener = listener
s.running = true
// Start cleanup routine
// Start background routines
go s.cleanupRoutine()
go s.activeLogFetchLoop()
// Start serving in a goroutine
go func() {
@@ -437,9 +447,90 @@ func (s *WorkerGrpcServer) handleTaskCompletion(conn *WorkerConnection, completi
} else {
glog.Errorf("Worker %s failed task %s: %s", conn.workerID, completion.TaskId, completion.ErrorMessage)
}
// Fetch and persist logs
go s.FetchAndSaveLogs(conn.workerID, completion.TaskId)
}
}
// FetchAndSaveLogs retrieves logs from a worker and saves them to disk.
// It retries the fetch a few times to ride out the worker's terminal log
// sync, converts the protobuf entries into maintenance.TaskExecutionLog
// records (truncating oversized messages and field sets), and persists the
// result via the admin server's config persistence layer.
func (s *WorkerGrpcServer) FetchAndSaveLogs(workerID, taskID string) error {
	// Small initial delay to allow the worker to finalize and sync logs,
	// especially when this is called immediately after TaskComplete.
	time.Sleep(300 * time.Millisecond)

	var workerLogs []*worker_pb.TaskLogEntry
	var err error
	// Retry a few times if the fetch fails, as logs might be in the middle
	// of a terminal sync on the worker side.
	for attempt := 1; attempt <= 3; attempt++ {
		workerLogs, err = s.RequestTaskLogs(workerID, taskID, maxLogFetchLimit, "")
		if err == nil {
			break
		}
		if attempt < 3 {
			glog.V(1).Infof("Fetch logs attempt %d failed for task %s: %v. Retrying in 1s...", attempt, taskID, err)
			time.Sleep(1 * time.Second)
		}
	}
	if err != nil {
		glog.Warningf("Failed to fetch logs for task %s after 3 attempts: %v", taskID, err)
		return err
	}

	// Convert worker log entries into maintenance execution logs.
	maintenanceLogs := make([]*maintenance.TaskExecutionLog, 0, len(workerLogs))
	for _, workerLog := range workerLogs {
		maintenanceLog := &maintenance.TaskExecutionLog{
			Timestamp: time.Unix(workerLog.Timestamp, 0),
			Level:     workerLog.Level,
			Message:   workerLog.Message,
			Source:    "worker",
			TaskID:    taskID,
			WorkerID:  workerID,
		}
		// Truncate very long messages to prevent rendering issues and disk bloat.
		if len(maintenanceLog.Message) > maxLogMessageSize {
			maintenanceLog.Message = maintenanceLog.Message[:maxLogMessageSize] + "... (truncated)"
		}
		// Carry structured fields if present, capped at maxLogFieldsCount.
		// NOTE(review): map iteration order is random, so which fields survive
		// the cap is nondeterministic when truncation kicks in.
		if len(workerLog.Fields) > 0 {
			maintenanceLog.Fields = make(map[string]string, len(workerLog.Fields))
			fieldCount := 0
			for k, v := range workerLog.Fields {
				if fieldCount >= maxLogFieldsCount {
					maintenanceLog.Fields["..."] = fmt.Sprintf("(%d more fields truncated)", len(workerLog.Fields)-maxLogFieldsCount)
					break
				}
				maintenanceLog.Fields[k] = v
				fieldCount++
			}
		}
		// Carry optional progress/status.
		if workerLog.Progress != 0 {
			p := float64(workerLog.Progress)
			maintenanceLog.Progress = &p
		}
		if workerLog.Status != "" {
			maintenanceLog.Status = workerLog.Status
		}
		maintenanceLogs = append(maintenanceLogs, maintenanceLog)
	}

	// Persist logs. Guard against a nil adminServer for consistency with the
	// nil checks performed elsewhere (e.g. activeLogFetchLoop); without this,
	// an uninitialized admin server would cause a nil pointer panic here.
	if s.adminServer != nil && s.adminServer.configPersistence != nil {
		if err := s.adminServer.configPersistence.SaveTaskExecutionLogs(taskID, maintenanceLogs); err != nil {
			glog.Errorf("Failed to persist logs for task %s: %v", taskID, err)
			return err
		}
	}
	return nil
}
// handleTaskLogResponse processes task log responses from workers
func (s *WorkerGrpcServer) handleTaskLogResponse(conn *WorkerConnection, response *worker_pb.TaskLogResponse) {
requestKey := fmt.Sprintf("%s:%s", response.WorkerId, response.TaskId)
@@ -575,10 +666,13 @@ func (s *WorkerGrpcServer) RequestTaskLogs(workerID, taskID string, maxEntries i
TaskID: taskID,
WorkerID: workerID,
ResponseCh: responseCh,
Timeout: time.Now().Add(10 * time.Second),
}
s.logRequestsMutex.Lock()
if _, exists := s.pendingLogRequests[requestKey]; exists {
s.logRequestsMutex.Unlock()
return nil, fmt.Errorf("a log request for task %s is already in progress", taskID)
}
s.pendingLogRequests[requestKey] = requestContext
s.logRequestsMutex.Unlock()
@@ -601,10 +695,12 @@ func (s *WorkerGrpcServer) RequestTaskLogs(workerID, taskID string, maxEntries i
select {
case conn.outgoing <- logRequest:
glog.V(1).Infof("Log request sent to worker %s for task %s", workerID, taskID)
case <-time.After(5 * time.Second):
case <-time.After(logSendTimeout):
// Clean up pending request on timeout
s.logRequestsMutex.Lock()
delete(s.pendingLogRequests, requestKey)
if s.pendingLogRequests[requestKey] == requestContext {
delete(s.pendingLogRequests, requestKey)
}
s.logRequestsMutex.Unlock()
return nil, fmt.Errorf("timeout sending log request to worker %s", workerID)
}
@@ -617,10 +713,12 @@ func (s *WorkerGrpcServer) RequestTaskLogs(workerID, taskID string, maxEntries i
}
glog.V(1).Infof("Received %d log entries for task %s from worker %s", len(response.LogEntries), taskID, workerID)
return response.LogEntries, nil
case <-time.After(10 * time.Second):
case <-time.After(logResponseTimeout):
// Clean up pending request on timeout
s.logRequestsMutex.Lock()
delete(s.pendingLogRequests, requestKey)
if s.pendingLogRequests[requestKey] == requestContext {
delete(s.pendingLogRequests, requestKey)
}
s.logRequestsMutex.Unlock()
return nil, fmt.Errorf("timeout waiting for log response from worker %s", workerID)
}
@@ -684,3 +782,38 @@ func findClientAddress(ctx context.Context) string {
}
return pr.Addr.String()
}
// activeLogFetchLoop periodically fetches logs for all in-progress tasks so
// that persisted logs on disk stay reasonably fresh. It runs until s.stopChan
// signals shutdown.
func (s *WorkerGrpcServer) activeLogFetchLoop() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for {
		// Block until the next tick or a shutdown signal.
		select {
		case <-s.stopChan:
			return
		case <-ticker.C:
		}

		// Skip the cycle when the server is stopping or dependencies are missing.
		if !s.running || s.adminServer == nil || s.adminServer.maintenanceManager == nil {
			continue
		}

		// Gather all in-progress tasks; nothing to do when there are none.
		inProgress := s.adminServer.maintenanceManager.GetTasks(maintenance.TaskStatusInProgress, "", 0)
		if len(inProgress) == 0 {
			continue
		}
		glog.V(2).Infof("Background log fetcher: found %d in-progress tasks", len(inProgress))

		for _, t := range inProgress {
			if t.WorkerID == "" {
				continue
			}
			// Fetch in a goroutine so one slow worker cannot block the loop.
			go func(wID, tID string) {
				if err := s.FetchAndSaveLogs(wID, tID); err != nil {
					glog.V(2).Infof("Background log fetch failed for task %s on worker %s: %v", tID, wID, err)
				}
			}(t.WorkerID, t.ID)
		}
	}
}