Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)

* fix float stepping

* do not auto refresh

* only log when non-200 status

* fix maintenance task sorting and cleanup redundant handler logic

* Refactor log retrieval to persist to disk and fix slowness

- Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail
- Implement background log fetching on task completion in worker_grpc_server.go
- Implement async background refresh for in-progress tasks
- Completely remove blocking gRPC calls from the UI path to fix 10s timeouts
- Cleanup debug logs and performance profiling code

* Ensure consistent deterministic sorting in config_persistence cleanup

* Replace magic numbers with constants and remove debug logs

- Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go
- Replaced magic numbers with these constants throughout the codebase
- Verified removal of stdout debug printing
- Ensured consistent truncation logic during log persistence

* Address code review feedback on history truncation and logging logic

- Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail
- Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs)
- Remove unused Timeout field from LogRequestContext and sync select timeouts with constants
- Ensure AssignmentHistory is only provided in the top-level field for better JSON structure

* Implement goroutine leak protection and request deduplication

- Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task
- Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map
- Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming
- Ensure all persistent log-fetching goroutines are bounded and efficiently managed

* Fix potential nil pointer panics in maintenance handlers

- Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig
- Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized
- Ensure internal helper methods consistently check for adminServer initialization before use

* Strictly enforce disk-only log reading

- Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view
- Remove unused lastLogFetch tracking fields to clean up dead code
- Ensure logs are only updated upon task completion via handleTaskCompletion

* Refactor GetWorkerLogs to read from disk

- Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs
- Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors
- Ensure consistent log retrieval behavior across the application (disk-only)

* Fix timestamp parsing in log viewer

- Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps
- Fix "Invalid time value" error when displaying logs fetched from disk
- Regenerate templates

* master: fallback to HDD if SSD volumes are full in Assign

* worker: improve EC detection logging and fix skip counters

* worker: add Sync method to TaskLogger interface

* worker: implement Sync and ensure logs are flushed before task completion

* admin: improve task log retrieval with retries and better timeouts

* admin: robust timestamp parsing in task detail view
This commit is contained in:
Chris Lu
2026-02-04 08:48:55 -08:00
committed by GitHub
parent 2ff1cd9fc9
commit 72a8f598f2
51 changed files with 499 additions and 241 deletions

View File

@@ -241,13 +241,15 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
results = append(results, result)
} else {
// Count debug reasons
if metric.Age < quietThreshold {
skippedQuietTime++
}
if metric.FullnessRatio < ecConfig.FullnessRatio {
skippedFullness++
}
if debugCount < 5 { // Limit to avoid spam
if metric.Age < quietThreshold {
skippedQuietTime++
}
if metric.FullnessRatio < ecConfig.FullnessRatio {
skippedFullness++
}
// Logic moved outside
}
debugCount++
}
@@ -256,7 +258,7 @@ func Detection(metrics []*types.VolumeHealthMetrics, clusterInfo *types.ClusterI
// Log debug summary if no tasks were created
if len(results) == 0 && len(metrics) > 0 {
totalVolumes := len(metrics)
glog.V(1).Infof("EC detection: No tasks created for %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
glog.Infof("EC detection: No tasks created for %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
totalVolumes, skippedAlreadyEC, skippedTooSmall, skippedCollectionFilter, skippedQuietTime, skippedFullness)
// Show details for first few volumes

View File

@@ -30,6 +30,7 @@ type TaskLogger interface {
LogWithFields(level string, message string, fields map[string]interface{})
// Lifecycle
Sync() error
Close() error
GetLogDir() string
}
@@ -230,6 +231,17 @@ func (l *FileTaskLogger) LogWithFields(level string, message string, fields map[
l.writeLogEntry(entry)
}
// Sync flushes any buffered log data to stable storage. It is a no-op
// (returning nil) when no log file is currently open. Safe for concurrent
// use: the logger's mutex is held for the duration of the flush.
func (l *FileTaskLogger) Sync() error {
	l.mutex.Lock()
	defer l.mutex.Unlock()
	if l.logFile == nil {
		// Nothing open yet (or already closed) — nothing to flush.
		return nil
	}
	return l.logFile.Sync()
}
// Close closes the logger and finalizes metadata
func (l *FileTaskLogger) Close() error {
l.Info("Task logger closed for %s", l.taskID)
@@ -423,7 +435,10 @@ func ReadTaskLogs(logDir string) ([]TaskLogEntry, error) {
if err == io.EOF {
break
}
return nil, fmt.Errorf("failed to decode log entry: %w", err)
// If we fail to decode an entry, it might be a partial write at the end of the file
// Return what we have so far instead of failing the entire request
glog.V(1).Infof("Failed to decode log entry in %s: %v (returning %d partial logs)", logPath, err, len(entries))
break
}
entries = append(entries, entry)
}

View File

@@ -19,6 +19,7 @@ type TaskLogger interface {
Error(message string, args ...interface{})
Debug(message string, args ...interface{})
LogWithFields(level string, message string, fields map[string]interface{})
Sync() error
Close() error
}

View File

@@ -707,6 +707,9 @@ func (w *Worker) executeTask(task *types.TaskInput) {
err = taskInstance.Execute(ctx, task.TypedParams)
// Report completion
if fileLogger != nil {
fileLogger.Sync()
}
if err != nil {
w.completeTask(task.ID, false, err.Error())
w.cmds <- workerCommand{
@@ -718,14 +721,15 @@ func (w *Worker) executeTask(task *types.TaskInput) {
fileLogger.Error("Task %s failed: %v", task.ID, err)
}
} else {
if fileLogger != nil {
fileLogger.Info("Task %s completed successfully", task.ID)
fileLogger.Sync()
}
w.completeTask(task.ID, true, "")
w.cmds <- workerCommand{
action: ActionIncTaskComplete,
}
glog.Infof("Worker %s completed task %s successfully", w.id, task.ID)
if fileLogger != nil {
fileLogger.Info("Task %s completed successfully", task.ID)
}
}
}