Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)

* fix float stepping

* do not auto refresh

* only log when non-200 status

* fix maintenance task sorting and cleanup redundant handler logic

* Refactor log retrieval to persist to disk and fix slowness

- Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail
- Implement background log fetching on task completion in worker_grpc_server.go
- Implement async background refresh for in-progress tasks
- Completely remove blocking gRPC calls from the UI path to fix 10s timeouts
- Cleanup debug logs and performance profiling code

* Ensure consistent deterministic sorting in config_persistence cleanup

* Replace magic numbers with constants and remove debug logs

- Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go
- Replaced magic numbers with these constants throughout the codebase
- Verified removal of stdout debug printing
- Ensured consistent truncation logic during log persistence

* Address code review feedback on history truncation and logging logic

- Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail
- Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs)
- Remove unused Timeout field from LogRequestContext and sync select timeouts with constants
- Ensure AssignmentHistory is only provided in the top-level field for better JSON structure

* Implement goroutine leak protection and request deduplication

- Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task
- Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map
- Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming
- Ensure all persistent log-fetching goroutines are bounded and efficiently managed

* Fix potential nil pointer panics in maintenance handlers

- Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig
- Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized
- Ensure internal helper methods consistently check for adminServer initialization before use

* Strictly enforce disk-only log reading

- Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view
- Remove unused lastLogFetch tracking fields to clean up dead code
- Ensure logs are only updated upon task completion via handleTaskCompletion

* Refactor GetWorkerLogs to read from disk

- Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs
- Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors
- Ensure consistent log retrieval behavior across the application (disk-only)

* Fix timestamp parsing in log viewer

- Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps
- Fix "Invalid time value" error when displaying logs fetched from disk
- Regenerate templates

* master: fallback to HDD if SSD volumes are full in Assign

* worker: improve EC detection logging and fix skip counters

* worker: add Sync method to TaskLogger interface

* worker: implement Sync and ensure logs are flushed before task completion

* admin: improve task log retrieval with retries and better timeouts

* admin: robust timestamp parsing in task detail view
This commit is contained in:
Chris Lu
2026-02-04 08:48:55 -08:00
committed by GitHub
parent 2ff1cd9fc9
commit 72a8f598f2
51 changed files with 499 additions and 241 deletions

View File

@@ -241,13 +241,15 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
results = append(results, result)
} else {
// Count debug reasons
if metric.Age < quietThreshold {
skippedQuietTime++
}
if metric.FullnessRatio < ecConfig.FullnessRatio {
skippedFullness++
}
if debugCount < 5 { // Limit to avoid spam
if metric.Age < quietThreshold {
skippedQuietTime++
}
if metric.FullnessRatio < ecConfig.FullnessRatio {
skippedFullness++
}
// Logic moved outside
}
debugCount++
}
@@ -256,7 +258,7 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
// Log debug summary if no tasks were created
if len(results) == 0 && len(metrics) > 0 {
totalVolumes := len(metrics)
glog.V(1).Infof("EC detection: No tasks created for %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
glog.Infof("EC detection: No tasks created for %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
totalVolumes, skippedAlreadyEC, skippedTooSmall, skippedCollectionFilter, skippedQuietTime, skippedFullness)
// Show details for first few volumes

View File

@@ -30,6 +30,7 @@ type TaskLogger interface {
LogWithFields(level string, message string, fields map[string]interface{})
// Lifecycle
Sync() error
Close() error
GetLogDir() string
}
@@ -230,6 +231,17 @@ func (l *FileTaskLogger) LogWithFields(level string, message string, fields map[
l.writeLogEntry(entry)
}
// Sync flushes any buffered log data to stable storage. It is a no-op
// (returning nil) when no log file is currently open. Safe for concurrent
// use: the logger's mutex is held for the duration of the flush.
func (l *FileTaskLogger) Sync() error {
	l.mutex.Lock()
	defer l.mutex.Unlock()
	if l.logFile == nil {
		// Nothing open yet (or already closed) — nothing to flush.
		return nil
	}
	return l.logFile.Sync()
}
// Close closes the logger and finalizes metadata
func (l *FileTaskLogger) Close() error {
l.Info("Task logger closed for %s", l.taskID)
@@ -423,7 +435,10 @@ func ReadTaskLogs(logDir string) ([]TaskLogEntry, error) {
if err == io.EOF {
break
}
return nil, fmt.Errorf("failed to decode log entry: %w", err)
// If we fail to decode an entry, it might be a partial write at the end of the file
// Return what we have so far instead of failing the entire request
glog.V(1).Infof("Failed to decode log entry in %s: %v (returning %d partial logs)", logPath, err, len(entries))
break
}
entries = append(entries, entry)
}

View File

@@ -19,6 +19,7 @@ type TaskLogger interface {
Error(message string, args ...interface{})
Debug(message string, args ...interface{})
LogWithFields(level string, message string, fields map[string]interface{})
Sync() error
Close() error
}

View File

@@ -707,6 +707,9 @@ func (w *Worker) executeTask(task *types.TaskInput) {
err = taskInstance.Execute(ctx, task.TypedParams)
// Report completion
if fileLogger != nil {
fileLogger.Sync()
}
if err != nil {
w.completeTask(task.ID, false, err.Error())
w.cmds <- workerCommand{
@@ -718,14 +721,15 @@ func (w *Worker) executeTask(task *types.TaskInput) {
fileLogger.Error("Task %s failed: %v", task.ID, err)
}
} else {
if fileLogger != nil {
fileLogger.Info("Task %s completed successfully", task.ID)
fileLogger.Sync()
}
w.completeTask(task.ID, true, "")
w.cmds <- workerCommand{
action: ActionIncTaskComplete,
}
glog.Infof("Worker %s completed task %s successfully", w.id, task.ID)
if fileLogger != nil {
fileLogger.Info("Task %s completed successfully", task.ID)
}
}
}