Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)

* fix float stepping

* do not auto refresh

* only log when the status is non-200

* fix maintenance task sorting and clean up redundant handler logic

* Refactor log retrieval to persist to disk and fix slowness

- Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail
- Implement background log fetching on task completion in worker_grpc_server.go
- Implement async background refresh for in-progress tasks
- Completely remove blocking gRPC calls from the UI path to fix 10s timeouts
- Clean up debug logs and performance profiling code

* Ensure consistent deterministic sorting in config_persistence cleanup

* Replace magic numbers with constants and remove debug logs

- Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go
- Replaced magic numbers with these constants throughout the codebase
- Verified removal of stdout debug printing
- Ensured consistent truncation logic during log persistence

* Address code review feedback on history truncation and logging logic

- Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail
- Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs)
- Remove unused Timeout field from LogRequestContext and sync select timeouts with constants
- Ensure AssignmentHistory is only provided in the top-level field for better JSON structure

* Implement goroutine leak protection and request deduplication

- Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task
- Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map
- Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming
- Ensure all persistent log-fetching goroutines are bounded and efficiently managed

* Fix potential nil pointer panics in maintenance handlers

- Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig
- Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized
- Ensure internal helper methods consistently check for adminServer initialization before use

* Strictly enforce disk-only log reading

- Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view
- Remove unused lastLogFetch tracking fields to clean up dead code
- Ensure logs are only updated upon task completion via handleTaskCompletion

* Refactor GetWorkerLogs to read from disk

- Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs
- Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors
- Ensure consistent log retrieval behavior across the application (disk-only)

* Fix timestamp parsing in log viewer

- Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps
- Fix "Invalid time value" error when displaying logs fetched from disk
- Regenerate templates

* master: fallback to HDD if SSD volumes are full in Assign

* worker: improve EC detection logging and fix skip counters

* worker: add Sync method to TaskLogger interface

* worker: implement Sync and ensure logs are flushed before task completion

* admin: improve task log retrieval with retries and better timeouts

* admin: robust timestamp parsing in task detail view
This commit is contained in:
Chris Lu
2026-02-04 08:48:55 -08:00
committed by GitHub
parent 2ff1cd9fc9
commit 72a8f598f2
51 changed files with 499 additions and 241 deletions

View File

@@ -39,6 +39,11 @@ func NewMaintenanceHandlers(adminServer *dash.AdminServer) *MaintenanceHandlers
func (h *MaintenanceHandlers) ShowTaskDetail(c *gin.Context) {
taskID := c.Param("id")
if h.adminServer == nil {
c.String(http.StatusInternalServerError, "Admin server not initialized")
return
}
taskDetail, err := h.adminServer.GetMaintenanceTaskDetail(taskID)
if err != nil {
glog.Errorf("DEBUG ShowTaskDetail: error getting task detail for %s: %v", taskID, err)
@@ -111,6 +116,10 @@ func (h *MaintenanceHandlers) ShowMaintenanceQueue(c *gin.Context) {
// ShowMaintenanceWorkers displays the maintenance workers page
func (h *MaintenanceHandlers) ShowMaintenanceWorkers(c *gin.Context) {
if h.adminServer == nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Admin server not initialized"})
return
}
workersData, err := h.adminServer.GetMaintenanceWorkersData()
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -339,6 +348,8 @@ func (h *MaintenanceHandlers) UpdateTaskConfig(c *gin.Context) {
glog.Warningf("Failed to save task config to protobuf file: %v", err)
// Don't fail the request, just log the warning
}
} else if h.adminServer == nil {
glog.Warningf("Failed to save task config: admin server not initialized")
}
// Trigger a configuration reload in the maintenance manager
@@ -492,74 +503,25 @@ func (h *MaintenanceHandlers) UpdateMaintenanceConfig(c *gin.Context) {
// Helper methods that delegate to AdminServer
func (h *MaintenanceHandlers) getMaintenanceQueueData() (*maintenance.MaintenanceQueueData, error) {
tasks, err := h.getMaintenanceTasks()
if err != nil {
return nil, err
}
workers, err := h.getMaintenanceWorkers()
if err != nil {
return nil, err
}
stats, err := h.getMaintenanceQueueStats()
if err != nil {
return nil, err
}
data := &maintenance.MaintenanceQueueData{
Tasks: tasks,
Workers: workers,
Stats: stats,
LastUpdated: time.Now(),
}
return data, nil
}
func (h *MaintenanceHandlers) getMaintenanceQueueStats() (*maintenance.QueueStats, error) {
// Use the exported method from AdminServer
return h.adminServer.GetMaintenanceQueueStats()
}
func (h *MaintenanceHandlers) getMaintenanceTasks() ([]*maintenance.MaintenanceTask, error) {
// Call the maintenance manager directly to get recent tasks (limit for performance)
if h.adminServer == nil {
return []*maintenance.MaintenanceTask{}, nil
return nil, fmt.Errorf("admin server not initialized")
}
manager := h.adminServer.GetMaintenanceManager()
if manager == nil {
return []*maintenance.MaintenanceTask{}, nil
}
// Get recent tasks only (last 100) to prevent slow page loads
// Users can view more tasks via pagination if needed
allTasks := manager.GetTasks("", "", 100)
return allTasks, nil
}
func (h *MaintenanceHandlers) getMaintenanceWorkers() ([]*maintenance.MaintenanceWorker, error) {
// Get workers from the admin server's maintenance manager
if h.adminServer == nil {
return []*maintenance.MaintenanceWorker{}, nil
}
if h.adminServer.GetMaintenanceManager() == nil {
return []*maintenance.MaintenanceWorker{}, nil
}
// Get workers from the maintenance manager
workers := h.adminServer.GetMaintenanceManager().GetWorkers()
return workers, nil
// Use the exported method from AdminServer used by the JSON API
return h.adminServer.GetMaintenanceQueueData()
}
// getMaintenanceConfig returns the maintenance configuration data obtained
// from the admin server's GetMaintenanceConfigData method. It returns an
// error when the handler has no admin server attached.
func (h *MaintenanceHandlers) getMaintenanceConfig() (*maintenance.MaintenanceConfigData, error) {
	if server := h.adminServer; server != nil {
		// The admin server owns the actual configuration lookup.
		return server.GetMaintenanceConfigData()
	}
	return nil, fmt.Errorf("admin server not initialized")
}
// updateMaintenanceConfig forwards config to the admin server's
// UpdateMaintenanceConfigData method and returns its result. It returns an
// error when the handler has no admin server attached.
func (h *MaintenanceHandlers) updateMaintenanceConfig(config *maintenance.MaintenanceConfig) error {
	if server := h.adminServer; server != nil {
		// The admin server owns the actual configuration update.
		return server.UpdateMaintenanceConfigData(config)
	}
	return fmt.Errorf("admin server not initialized")
}