Admin UI add maintenance menu (#6944)
* add ui for maintenance * valid config loading. fix workers page. * refactor * grpc between admin and workers * add a long-running bidirectional grpc call between admin and worker * use the grpc call to heartbeat * use the grpc call to communicate * worker can remove the http client * admin uses http port + 10000 as its default grpc port * one task one package * handles connection failures gracefully with exponential backoff * grpc with insecure tls * grpc with optional tls * fix detecting tls * change time config from nano seconds to seconds * add tasks with 3 interfaces * compiles reducing hard coded * remove a couple of tasks * remove hard coded references * reduce hard coded values * remove hard coded values * remove hard coded from templ * refactor maintenance package * fix import cycle * simplify * simplify * auto register * auto register factory * auto register task types * self register types * refactor * simplify * remove one task * register ui * lazy init executor factories * use registered task types * DefaultWorkerConfig remove hard coded task types * remove more hard coded * implement get maintenance task * dynamic task configuration * "System Settings" should only have system level settings * adjust menu for tasks * ensure menu not collapsed * render job configuration well * use templ for ui of task configuration * fix ordering * fix bugs * saving duration in seconds * use value and unit for duration * Delete WORKER_REFACTORING_PLAN.md * Delete maintenance.json * Delete custom_worker_example.go * remove address from workers * remove old code from ec task * remove creating collection button * reconnect with exponential backoff * worker use security.toml * start admin server with tls info from security.toml * fix "weed admin" cli description
This commit is contained in:
500
weed/admin/maintenance/maintenance_queue.go
Normal file
500
weed/admin/maintenance/maintenance_queue.go
Normal file
@@ -0,0 +1,500 @@
|
||||
package maintenance
|
||||
|
||||
import (
	"math/rand"
	"sort"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
)
|
||||
|
||||
// NewMaintenanceQueue creates a new maintenance queue
|
||||
func NewMaintenanceQueue(policy *MaintenancePolicy) *MaintenanceQueue {
|
||||
queue := &MaintenanceQueue{
|
||||
tasks: make(map[string]*MaintenanceTask),
|
||||
workers: make(map[string]*MaintenanceWorker),
|
||||
pendingTasks: make([]*MaintenanceTask, 0),
|
||||
policy: policy,
|
||||
}
|
||||
return queue
|
||||
}
|
||||
|
||||
// SetIntegration sets the integration reference
|
||||
func (mq *MaintenanceQueue) SetIntegration(integration *MaintenanceIntegration) {
|
||||
mq.integration = integration
|
||||
glog.V(1).Infof("Maintenance queue configured with integration")
|
||||
}
|
||||
|
||||
// AddTask adds a new maintenance task to the queue
|
||||
func (mq *MaintenanceQueue) AddTask(task *MaintenanceTask) {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
task.ID = generateTaskID()
|
||||
task.Status = TaskStatusPending
|
||||
task.CreatedAt = time.Now()
|
||||
task.MaxRetries = 3 // Default retry count
|
||||
|
||||
mq.tasks[task.ID] = task
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
|
||||
// Sort pending tasks by priority and schedule time
|
||||
sort.Slice(mq.pendingTasks, func(i, j int) bool {
|
||||
if mq.pendingTasks[i].Priority != mq.pendingTasks[j].Priority {
|
||||
return mq.pendingTasks[i].Priority > mq.pendingTasks[j].Priority
|
||||
}
|
||||
return mq.pendingTasks[i].ScheduledAt.Before(mq.pendingTasks[j].ScheduledAt)
|
||||
})
|
||||
|
||||
glog.V(2).Infof("Added maintenance task %s: %s for volume %d", task.ID, task.Type, task.VolumeID)
|
||||
}
|
||||
|
||||
// AddTasksFromResults converts detection results to tasks and adds them to the queue
|
||||
func (mq *MaintenanceQueue) AddTasksFromResults(results []*TaskDetectionResult) {
|
||||
for _, result := range results {
|
||||
task := &MaintenanceTask{
|
||||
Type: result.TaskType,
|
||||
Priority: result.Priority,
|
||||
VolumeID: result.VolumeID,
|
||||
Server: result.Server,
|
||||
Collection: result.Collection,
|
||||
Parameters: result.Parameters,
|
||||
Reason: result.Reason,
|
||||
ScheduledAt: result.ScheduleAt,
|
||||
}
|
||||
mq.AddTask(task)
|
||||
}
|
||||
}
|
||||
|
||||
// GetNextTask returns the next available task for a worker
|
||||
func (mq *MaintenanceQueue) GetNextTask(workerID string, capabilities []MaintenanceTaskType) *MaintenanceTask {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
worker, exists := mq.workers[workerID]
|
||||
if !exists {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check if worker has capacity
|
||||
if worker.CurrentLoad >= worker.MaxConcurrent {
|
||||
return nil
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
|
||||
// Find the next suitable task
|
||||
for i, task := range mq.pendingTasks {
|
||||
// Check if it's time to execute the task
|
||||
if task.ScheduledAt.After(now) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if worker can handle this task type
|
||||
if !mq.workerCanHandle(task.Type, capabilities) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check scheduling logic - use simplified system if available, otherwise fallback
|
||||
if !mq.canScheduleTaskNow(task) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Assign task to worker
|
||||
task.Status = TaskStatusAssigned
|
||||
task.WorkerID = workerID
|
||||
startTime := now
|
||||
task.StartedAt = &startTime
|
||||
|
||||
// Remove from pending tasks
|
||||
mq.pendingTasks = append(mq.pendingTasks[:i], mq.pendingTasks[i+1:]...)
|
||||
|
||||
// Update worker
|
||||
worker.CurrentTask = task
|
||||
worker.CurrentLoad++
|
||||
worker.Status = "busy"
|
||||
|
||||
glog.V(2).Infof("Assigned task %s to worker %s", task.ID, workerID)
|
||||
return task
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CompleteTask marks a task as completed
|
||||
func (mq *MaintenanceQueue) CompleteTask(taskID string, error string) {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
task, exists := mq.tasks[taskID]
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
completedTime := time.Now()
|
||||
task.CompletedAt = &completedTime
|
||||
|
||||
if error != "" {
|
||||
task.Status = TaskStatusFailed
|
||||
task.Error = error
|
||||
|
||||
// Check if task should be retried
|
||||
if task.RetryCount < task.MaxRetries {
|
||||
task.RetryCount++
|
||||
task.Status = TaskStatusPending
|
||||
task.WorkerID = ""
|
||||
task.StartedAt = nil
|
||||
task.CompletedAt = nil
|
||||
task.Error = ""
|
||||
task.ScheduledAt = time.Now().Add(15 * time.Minute) // Retry delay
|
||||
|
||||
mq.pendingTasks = append(mq.pendingTasks, task)
|
||||
glog.V(2).Infof("Retrying task %s (attempt %d/%d)", taskID, task.RetryCount, task.MaxRetries)
|
||||
} else {
|
||||
glog.Errorf("Task %s failed permanently after %d retries: %s", taskID, task.MaxRetries, error)
|
||||
}
|
||||
} else {
|
||||
task.Status = TaskStatusCompleted
|
||||
task.Progress = 100
|
||||
glog.V(2).Infof("Task %s completed successfully", taskID)
|
||||
}
|
||||
|
||||
// Update worker
|
||||
if task.WorkerID != "" {
|
||||
if worker, exists := mq.workers[task.WorkerID]; exists {
|
||||
worker.CurrentTask = nil
|
||||
worker.CurrentLoad--
|
||||
if worker.CurrentLoad == 0 {
|
||||
worker.Status = "active"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateTaskProgress updates the progress of a running task
|
||||
func (mq *MaintenanceQueue) UpdateTaskProgress(taskID string, progress float64) {
|
||||
mq.mutex.RLock()
|
||||
defer mq.mutex.RUnlock()
|
||||
|
||||
if task, exists := mq.tasks[taskID]; exists {
|
||||
task.Progress = progress
|
||||
task.Status = TaskStatusInProgress
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterWorker registers a new worker
|
||||
func (mq *MaintenanceQueue) RegisterWorker(worker *MaintenanceWorker) {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
worker.LastHeartbeat = time.Now()
|
||||
worker.Status = "active"
|
||||
worker.CurrentLoad = 0
|
||||
mq.workers[worker.ID] = worker
|
||||
|
||||
glog.V(1).Infof("Registered maintenance worker %s at %s", worker.ID, worker.Address)
|
||||
}
|
||||
|
||||
// UpdateWorkerHeartbeat updates worker heartbeat
|
||||
func (mq *MaintenanceQueue) UpdateWorkerHeartbeat(workerID string) {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
if worker, exists := mq.workers[workerID]; exists {
|
||||
worker.LastHeartbeat = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
// GetRunningTaskCount returns the number of running tasks of a specific type
|
||||
func (mq *MaintenanceQueue) GetRunningTaskCount(taskType MaintenanceTaskType) int {
|
||||
mq.mutex.RLock()
|
||||
defer mq.mutex.RUnlock()
|
||||
|
||||
count := 0
|
||||
for _, task := range mq.tasks {
|
||||
if task.Type == taskType && (task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// WasTaskRecentlyCompleted checks if a similar task was recently completed
|
||||
func (mq *MaintenanceQueue) WasTaskRecentlyCompleted(taskType MaintenanceTaskType, volumeID uint32, server string, now time.Time) bool {
|
||||
mq.mutex.RLock()
|
||||
defer mq.mutex.RUnlock()
|
||||
|
||||
// Get the repeat prevention interval for this task type
|
||||
interval := mq.getRepeatPreventionInterval(taskType)
|
||||
cutoff := now.Add(-interval)
|
||||
|
||||
for _, task := range mq.tasks {
|
||||
if task.Type == taskType &&
|
||||
task.VolumeID == volumeID &&
|
||||
task.Server == server &&
|
||||
task.Status == TaskStatusCompleted &&
|
||||
task.CompletedAt != nil &&
|
||||
task.CompletedAt.After(cutoff) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// getRepeatPreventionInterval returns the interval for preventing task repetition
|
||||
func (mq *MaintenanceQueue) getRepeatPreventionInterval(taskType MaintenanceTaskType) time.Duration {
|
||||
// First try to get default from task scheduler
|
||||
if mq.integration != nil {
|
||||
if scheduler := mq.integration.GetTaskScheduler(taskType); scheduler != nil {
|
||||
defaultInterval := scheduler.GetDefaultRepeatInterval()
|
||||
if defaultInterval > 0 {
|
||||
glog.V(3).Infof("Using task scheduler default repeat interval for %s: %v", taskType, defaultInterval)
|
||||
return defaultInterval
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to policy configuration if no scheduler available or scheduler doesn't provide default
|
||||
if mq.policy != nil {
|
||||
repeatIntervalHours := mq.policy.GetRepeatInterval(taskType)
|
||||
if repeatIntervalHours > 0 {
|
||||
interval := time.Duration(repeatIntervalHours) * time.Hour
|
||||
glog.V(3).Infof("Using policy configuration repeat interval for %s: %v", taskType, interval)
|
||||
return interval
|
||||
}
|
||||
}
|
||||
|
||||
// Ultimate fallback - but avoid hardcoded values where possible
|
||||
glog.V(2).Infof("No scheduler or policy configuration found for task type %s, using minimal default: 1h", taskType)
|
||||
return time.Hour // Minimal safe default
|
||||
}
|
||||
|
||||
// GetTasks returns tasks with optional filtering
|
||||
func (mq *MaintenanceQueue) GetTasks(status MaintenanceTaskStatus, taskType MaintenanceTaskType, limit int) []*MaintenanceTask {
|
||||
mq.mutex.RLock()
|
||||
defer mq.mutex.RUnlock()
|
||||
|
||||
var tasks []*MaintenanceTask
|
||||
for _, task := range mq.tasks {
|
||||
if status != "" && task.Status != status {
|
||||
continue
|
||||
}
|
||||
if taskType != "" && task.Type != taskType {
|
||||
continue
|
||||
}
|
||||
tasks = append(tasks, task)
|
||||
if limit > 0 && len(tasks) >= limit {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by creation time (newest first)
|
||||
sort.Slice(tasks, func(i, j int) bool {
|
||||
return tasks[i].CreatedAt.After(tasks[j].CreatedAt)
|
||||
})
|
||||
|
||||
return tasks
|
||||
}
|
||||
|
||||
// GetWorkers returns all registered workers
|
||||
func (mq *MaintenanceQueue) GetWorkers() []*MaintenanceWorker {
|
||||
mq.mutex.RLock()
|
||||
defer mq.mutex.RUnlock()
|
||||
|
||||
var workers []*MaintenanceWorker
|
||||
for _, worker := range mq.workers {
|
||||
workers = append(workers, worker)
|
||||
}
|
||||
return workers
|
||||
}
|
||||
|
||||
// generateTaskID generates a unique ID for tasks
|
||||
func generateTaskID() string {
|
||||
const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
|
||||
b := make([]byte, 8)
|
||||
for i := range b {
|
||||
b[i] = charset[i%len(charset)]
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
// CleanupOldTasks removes old completed and failed tasks
|
||||
func (mq *MaintenanceQueue) CleanupOldTasks(retention time.Duration) int {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
cutoff := time.Now().Add(-retention)
|
||||
removed := 0
|
||||
|
||||
for id, task := range mq.tasks {
|
||||
if (task.Status == TaskStatusCompleted || task.Status == TaskStatusFailed) &&
|
||||
task.CompletedAt != nil &&
|
||||
task.CompletedAt.Before(cutoff) {
|
||||
delete(mq.tasks, id)
|
||||
removed++
|
||||
}
|
||||
}
|
||||
|
||||
glog.V(2).Infof("Cleaned up %d old maintenance tasks", removed)
|
||||
return removed
|
||||
}
|
||||
|
||||
// RemoveStaleWorkers removes workers that haven't sent heartbeat recently
|
||||
func (mq *MaintenanceQueue) RemoveStaleWorkers(timeout time.Duration) int {
|
||||
mq.mutex.Lock()
|
||||
defer mq.mutex.Unlock()
|
||||
|
||||
cutoff := time.Now().Add(-timeout)
|
||||
removed := 0
|
||||
|
||||
for id, worker := range mq.workers {
|
||||
if worker.LastHeartbeat.Before(cutoff) {
|
||||
// Mark any assigned tasks as failed
|
||||
for _, task := range mq.tasks {
|
||||
if task.WorkerID == id && (task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress) {
|
||||
task.Status = TaskStatusFailed
|
||||
task.Error = "Worker became unavailable"
|
||||
completedTime := time.Now()
|
||||
task.CompletedAt = &completedTime
|
||||
}
|
||||
}
|
||||
|
||||
delete(mq.workers, id)
|
||||
removed++
|
||||
glog.Warningf("Removed stale maintenance worker %s", id)
|
||||
}
|
||||
}
|
||||
|
||||
return removed
|
||||
}
|
||||
|
||||
// GetStats returns maintenance statistics
|
||||
func (mq *MaintenanceQueue) GetStats() *MaintenanceStats {
|
||||
mq.mutex.RLock()
|
||||
defer mq.mutex.RUnlock()
|
||||
|
||||
stats := &MaintenanceStats{
|
||||
TotalTasks: len(mq.tasks),
|
||||
TasksByStatus: make(map[MaintenanceTaskStatus]int),
|
||||
TasksByType: make(map[MaintenanceTaskType]int),
|
||||
ActiveWorkers: 0,
|
||||
}
|
||||
|
||||
today := time.Now().Truncate(24 * time.Hour)
|
||||
var totalDuration time.Duration
|
||||
var completedTasks int
|
||||
|
||||
for _, task := range mq.tasks {
|
||||
stats.TasksByStatus[task.Status]++
|
||||
stats.TasksByType[task.Type]++
|
||||
|
||||
if task.CompletedAt != nil && task.CompletedAt.After(today) {
|
||||
if task.Status == TaskStatusCompleted {
|
||||
stats.CompletedToday++
|
||||
} else if task.Status == TaskStatusFailed {
|
||||
stats.FailedToday++
|
||||
}
|
||||
|
||||
if task.StartedAt != nil {
|
||||
duration := task.CompletedAt.Sub(*task.StartedAt)
|
||||
totalDuration += duration
|
||||
completedTasks++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, worker := range mq.workers {
|
||||
if worker.Status == "active" || worker.Status == "busy" {
|
||||
stats.ActiveWorkers++
|
||||
}
|
||||
}
|
||||
|
||||
if completedTasks > 0 {
|
||||
stats.AverageTaskTime = totalDuration / time.Duration(completedTasks)
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// workerCanHandle checks if a worker can handle a specific task type
|
||||
func (mq *MaintenanceQueue) workerCanHandle(taskType MaintenanceTaskType, capabilities []MaintenanceTaskType) bool {
|
||||
for _, capability := range capabilities {
|
||||
if capability == taskType {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// canScheduleTaskNow determines if a task can be scheduled using task schedulers or fallback logic
|
||||
func (mq *MaintenanceQueue) canScheduleTaskNow(task *MaintenanceTask) bool {
|
||||
// Try task scheduling logic first
|
||||
if mq.integration != nil {
|
||||
// Get all running tasks and available workers
|
||||
runningTasks := mq.getRunningTasks()
|
||||
availableWorkers := mq.getAvailableWorkers()
|
||||
|
||||
canSchedule := mq.integration.CanScheduleWithTaskSchedulers(task, runningTasks, availableWorkers)
|
||||
glog.V(3).Infof("Task scheduler decision for task %s (%s): %v", task.ID, task.Type, canSchedule)
|
||||
return canSchedule
|
||||
}
|
||||
|
||||
// Fallback to hardcoded logic
|
||||
return mq.canExecuteTaskType(task.Type)
|
||||
}
|
||||
|
||||
// canExecuteTaskType checks if we can execute more tasks of this type (concurrency limits) - fallback logic
|
||||
func (mq *MaintenanceQueue) canExecuteTaskType(taskType MaintenanceTaskType) bool {
|
||||
runningCount := mq.GetRunningTaskCount(taskType)
|
||||
maxConcurrent := mq.getMaxConcurrentForTaskType(taskType)
|
||||
|
||||
return runningCount < maxConcurrent
|
||||
}
|
||||
|
||||
// getMaxConcurrentForTaskType returns the maximum concurrent tasks allowed for a task type
|
||||
func (mq *MaintenanceQueue) getMaxConcurrentForTaskType(taskType MaintenanceTaskType) int {
|
||||
// First try to get default from task scheduler
|
||||
if mq.integration != nil {
|
||||
if scheduler := mq.integration.GetTaskScheduler(taskType); scheduler != nil {
|
||||
maxConcurrent := scheduler.GetMaxConcurrent()
|
||||
if maxConcurrent > 0 {
|
||||
glog.V(3).Infof("Using task scheduler max concurrent for %s: %d", taskType, maxConcurrent)
|
||||
return maxConcurrent
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to policy configuration if no scheduler available or scheduler doesn't provide default
|
||||
if mq.policy != nil {
|
||||
maxConcurrent := mq.policy.GetMaxConcurrent(taskType)
|
||||
if maxConcurrent > 0 {
|
||||
glog.V(3).Infof("Using policy configuration max concurrent for %s: %d", taskType, maxConcurrent)
|
||||
return maxConcurrent
|
||||
}
|
||||
}
|
||||
|
||||
// Ultimate fallback - minimal safe default
|
||||
glog.V(2).Infof("No scheduler or policy configuration found for task type %s, using minimal default: 1", taskType)
|
||||
return 1
|
||||
}
|
||||
|
||||
// getRunningTasks returns all currently running tasks
|
||||
func (mq *MaintenanceQueue) getRunningTasks() []*MaintenanceTask {
|
||||
var runningTasks []*MaintenanceTask
|
||||
for _, task := range mq.tasks {
|
||||
if task.Status == TaskStatusAssigned || task.Status == TaskStatusInProgress {
|
||||
runningTasks = append(runningTasks, task)
|
||||
}
|
||||
}
|
||||
return runningTasks
|
||||
}
|
||||
|
||||
// getAvailableWorkers returns all workers that can take more work
|
||||
func (mq *MaintenanceQueue) getAvailableWorkers() []*MaintenanceWorker {
|
||||
var availableWorkers []*MaintenanceWorker
|
||||
for _, worker := range mq.workers {
|
||||
if worker.Status == "active" && worker.CurrentLoad < worker.MaxConcurrent {
|
||||
availableWorkers = append(availableWorkers, worker)
|
||||
}
|
||||
}
|
||||
return availableWorkers
|
||||
}
|
||||
Reference in New Issue
Block a user