♻️ refactor(worker): decouple state management using command-query pattern (#7354)
* ♻️ refactor(worker): decouple state management using command-query pattern This commit eliminates all uses of sync.Mutex across the `worker.go` and `client.go` components, changing how mutable state is accessed and modified. Single Owner Principle is now enforced. - Guarantees thread safety and prevents data races by ensuring that only one goroutine ever modifies or reads state. Impact: Improves application concurrency, reliability, and maintainability by isolating state concerns. * 🐛 fix(worker): fix race condition when closing The use of select/default is wrong for mandatory shutdown signals. * 🐛 fix(worker): do not get tickers in every iteration * 🐛 fix(worker): fix race condition when closing pt 2 refactor `handleOutgoing` to mirror the non-blocking logic of `handleIncoming` * address comments * To ensure stream errors are always processed, the send should be blocking. * avoid blocking the manager loop while waiting for tasks to complete --------- Co-authored-by: Chris Lu <chrislusf@users.noreply.github.com> Co-authored-by: Chris Lu <chris.lu@gmail.com>
This commit is contained in:
committed by
GitHub
parent
f7bd75ef3b
commit
fa025dc96f
File diff suppressed because it is too large
Load Diff
@@ -23,20 +23,56 @@ import (
|
|||||||
|
|
||||||
// Worker represents a maintenance worker instance
|
// Worker represents a maintenance worker instance
|
||||||
type Worker struct {
|
type Worker struct {
|
||||||
id string
|
id string
|
||||||
config *types.WorkerConfig
|
config *types.WorkerConfig
|
||||||
registry *tasks.TaskRegistry
|
registry *tasks.TaskRegistry
|
||||||
currentTasks map[string]*types.TaskInput
|
cmds chan workerCommand
|
||||||
adminClient AdminClient
|
state *workerState
|
||||||
|
taskLogHandler *tasks.TaskLogHandler
|
||||||
|
closeOnce sync.Once
|
||||||
|
}
|
||||||
|
type workerState struct {
|
||||||
running bool
|
running bool
|
||||||
stopChan chan struct{}
|
adminClient AdminClient
|
||||||
mutex sync.RWMutex
|
|
||||||
startTime time.Time
|
startTime time.Time
|
||||||
tasksCompleted int
|
stopChan chan struct{}
|
||||||
tasksFailed int
|
|
||||||
heartbeatTicker *time.Ticker
|
heartbeatTicker *time.Ticker
|
||||||
requestTicker *time.Ticker
|
requestTicker *time.Ticker
|
||||||
taskLogHandler *tasks.TaskLogHandler
|
currentTasks map[string]*types.TaskInput
|
||||||
|
tasksCompleted int
|
||||||
|
tasksFailed int
|
||||||
|
}
|
||||||
|
|
||||||
|
type workerAction string
|
||||||
|
|
||||||
|
const (
|
||||||
|
ActionStart workerAction = "start"
|
||||||
|
ActionStop workerAction = "stop"
|
||||||
|
ActionGetStatus workerAction = "getstatus"
|
||||||
|
ActionGetTaskLoad workerAction = "getload"
|
||||||
|
ActionSetTask workerAction = "settask"
|
||||||
|
ActionSetAdmin workerAction = "setadmin"
|
||||||
|
ActionRemoveTask workerAction = "removetask"
|
||||||
|
ActionGetAdmin workerAction = "getadmin"
|
||||||
|
ActionIncTaskFail workerAction = "inctaskfail"
|
||||||
|
ActionIncTaskComplete workerAction = "inctaskcomplete"
|
||||||
|
ActionGetHbTick workerAction = "gethbtick"
|
||||||
|
ActionGetReqTick workerAction = "getreqtick"
|
||||||
|
ActionGetStopChan workerAction = "getstopchan"
|
||||||
|
ActionSetHbTick workerAction = "sethbtick"
|
||||||
|
ActionSetReqTick workerAction = "setreqtick"
|
||||||
|
ActionGetStartTime workerAction = "getstarttime"
|
||||||
|
ActionGetCompletedTasks workerAction = "getcompletedtasks"
|
||||||
|
ActionGetFailedTasks workerAction = "getfailedtasks"
|
||||||
|
ActionCancelTask workerAction = "canceltask"
|
||||||
|
// ... other worker actions like Stop, Status, etc.
|
||||||
|
)
|
||||||
|
|
||||||
|
type statusResponse chan types.WorkerStatus
|
||||||
|
type workerCommand struct {
|
||||||
|
action workerAction
|
||||||
|
data any
|
||||||
|
resp chan error // for reporting success/failure
|
||||||
}
|
}
|
||||||
|
|
||||||
// AdminClient defines the interface for communicating with the admin server
|
// AdminClient defines the interface for communicating with the admin server
|
||||||
@@ -150,17 +186,222 @@ func NewWorker(config *types.WorkerConfig) (*Worker, error) {
|
|||||||
id: workerID,
|
id: workerID,
|
||||||
config: config,
|
config: config,
|
||||||
registry: registry,
|
registry: registry,
|
||||||
currentTasks: make(map[string]*types.TaskInput),
|
|
||||||
stopChan: make(chan struct{}),
|
|
||||||
startTime: time.Now(),
|
|
||||||
taskLogHandler: taskLogHandler,
|
taskLogHandler: taskLogHandler,
|
||||||
|
cmds: make(chan workerCommand),
|
||||||
}
|
}
|
||||||
|
|
||||||
glog.V(1).Infof("Worker created with %d registered task types", len(registry.GetAll()))
|
glog.V(1).Infof("Worker created with %d registered task types", len(registry.GetAll()))
|
||||||
|
go worker.managerLoop()
|
||||||
return worker, nil
|
return worker, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (w *Worker) managerLoop() {
|
||||||
|
w.state = &workerState{
|
||||||
|
startTime: time.Now(),
|
||||||
|
stopChan: make(chan struct{}),
|
||||||
|
currentTasks: make(map[string]*types.TaskInput),
|
||||||
|
}
|
||||||
|
|
||||||
|
for cmd := range w.cmds {
|
||||||
|
switch cmd.action {
|
||||||
|
case ActionStart:
|
||||||
|
w.handleStart(cmd)
|
||||||
|
case ActionStop:
|
||||||
|
w.handleStop(cmd)
|
||||||
|
case ActionGetStatus:
|
||||||
|
respCh := cmd.data.(statusResponse)
|
||||||
|
var currentTasks []types.TaskInput
|
||||||
|
for _, task := range w.state.currentTasks {
|
||||||
|
currentTasks = append(currentTasks, *task)
|
||||||
|
}
|
||||||
|
|
||||||
|
statusStr := "active"
|
||||||
|
if len(w.state.currentTasks) >= w.config.MaxConcurrent {
|
||||||
|
statusStr = "busy"
|
||||||
|
}
|
||||||
|
|
||||||
|
status := types.WorkerStatus{
|
||||||
|
WorkerID: w.id,
|
||||||
|
Status: statusStr,
|
||||||
|
Capabilities: w.config.Capabilities,
|
||||||
|
MaxConcurrent: w.config.MaxConcurrent,
|
||||||
|
CurrentLoad: len(w.state.currentTasks),
|
||||||
|
LastHeartbeat: time.Now(),
|
||||||
|
CurrentTasks: currentTasks,
|
||||||
|
Uptime: time.Since(w.state.startTime),
|
||||||
|
TasksCompleted: w.state.tasksCompleted,
|
||||||
|
TasksFailed: w.state.tasksFailed,
|
||||||
|
}
|
||||||
|
respCh <- status
|
||||||
|
case ActionGetTaskLoad:
|
||||||
|
respCh := cmd.data.(chan int)
|
||||||
|
respCh <- len(w.state.currentTasks)
|
||||||
|
case ActionSetTask:
|
||||||
|
currentLoad := len(w.state.currentTasks)
|
||||||
|
if currentLoad >= w.config.MaxConcurrent {
|
||||||
|
cmd.resp <- fmt.Errorf("worker is at capacity")
|
||||||
|
}
|
||||||
|
task := cmd.data.(*types.TaskInput)
|
||||||
|
w.state.currentTasks[task.ID] = task
|
||||||
|
cmd.resp <- nil
|
||||||
|
case ActionSetAdmin:
|
||||||
|
admin := cmd.data.(AdminClient)
|
||||||
|
w.state.adminClient = admin
|
||||||
|
case ActionRemoveTask:
|
||||||
|
taskID := cmd.data.(string)
|
||||||
|
delete(w.state.currentTasks, taskID)
|
||||||
|
case ActionGetAdmin:
|
||||||
|
respCh := cmd.data.(chan AdminClient)
|
||||||
|
respCh <- w.state.adminClient
|
||||||
|
case ActionIncTaskFail:
|
||||||
|
w.state.tasksFailed++
|
||||||
|
case ActionIncTaskComplete:
|
||||||
|
w.state.tasksCompleted++
|
||||||
|
case ActionGetHbTick:
|
||||||
|
respCh := cmd.data.(chan *time.Ticker)
|
||||||
|
respCh <- w.state.heartbeatTicker
|
||||||
|
case ActionGetReqTick:
|
||||||
|
respCh := cmd.data.(chan *time.Ticker)
|
||||||
|
respCh <- w.state.requestTicker
|
||||||
|
case ActionSetHbTick:
|
||||||
|
w.state.heartbeatTicker = cmd.data.(*time.Ticker)
|
||||||
|
case ActionSetReqTick:
|
||||||
|
w.state.requestTicker = cmd.data.(*time.Ticker)
|
||||||
|
case ActionGetStopChan:
|
||||||
|
cmd.data.(chan chan struct{}) <- w.state.stopChan
|
||||||
|
case ActionGetStartTime:
|
||||||
|
cmd.data.(chan time.Time) <- w.state.startTime
|
||||||
|
case ActionGetCompletedTasks:
|
||||||
|
cmd.data.(chan int) <- w.state.tasksCompleted
|
||||||
|
case ActionGetFailedTasks:
|
||||||
|
cmd.data.(chan int) <- w.state.tasksFailed
|
||||||
|
case ActionCancelTask:
|
||||||
|
taskID := cmd.data.(string)
|
||||||
|
if task, exists := w.state.currentTasks[taskID]; exists {
|
||||||
|
glog.Infof("Cancelling task %s", task.ID)
|
||||||
|
// TODO: Implement actual task cancellation logic
|
||||||
|
} else {
|
||||||
|
glog.Warningf("Cannot cancel task %s: task not found", taskID)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) getTaskLoad() int {
|
||||||
|
respCh := make(chan int, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetTaskLoad,
|
||||||
|
data: respCh,
|
||||||
|
resp: nil,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) setTask(task *types.TaskInput) error {
|
||||||
|
resp := make(chan error)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionSetTask,
|
||||||
|
data: task,
|
||||||
|
resp: resp,
|
||||||
|
}
|
||||||
|
if err := <-resp; err != nil {
|
||||||
|
glog.Errorf("TASK REJECTED: Worker %s at capacity (%d/%d) - rejecting task %s",
|
||||||
|
w.id, w.getTaskLoad(), w.config.MaxConcurrent, task.ID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
newLoad := w.getTaskLoad()
|
||||||
|
|
||||||
|
glog.Infof("TASK ACCEPTED: Worker %s accepted task %s - current load: %d/%d",
|
||||||
|
w.id, task.ID, newLoad, w.config.MaxConcurrent)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) removeTask(task *types.TaskInput) int {
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionRemoveTask,
|
||||||
|
data: task.ID,
|
||||||
|
}
|
||||||
|
return w.getTaskLoad()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) getAdmin() AdminClient {
|
||||||
|
respCh := make(chan AdminClient, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetAdmin,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) getStopChan() chan struct{} {
|
||||||
|
respCh := make(chan chan struct{}, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetStopChan,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) getHbTick() *time.Ticker {
|
||||||
|
respCh := make(chan *time.Ticker, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetHbTick,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) getReqTick() *time.Ticker {
|
||||||
|
respCh := make(chan *time.Ticker, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetReqTick,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) setHbTick(tick *time.Ticker) *time.Ticker {
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionSetHbTick,
|
||||||
|
data: tick,
|
||||||
|
}
|
||||||
|
return w.getHbTick()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) setReqTick(tick *time.Ticker) *time.Ticker {
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionSetReqTick,
|
||||||
|
data: tick,
|
||||||
|
}
|
||||||
|
return w.getReqTick()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) getStartTime() time.Time {
|
||||||
|
respCh := make(chan time.Time, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetStartTime,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
func (w *Worker) getCompletedTasks() int {
|
||||||
|
respCh := make(chan int, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetCompletedTasks,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
func (w *Worker) getFailedTasks() int {
|
||||||
|
respCh := make(chan int, 1)
|
||||||
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetFailedTasks,
|
||||||
|
data: respCh,
|
||||||
|
}
|
||||||
|
return <-respCh
|
||||||
|
}
|
||||||
|
|
||||||
// getTaskLoggerConfig returns the task logger configuration with worker's log directory
|
// getTaskLoggerConfig returns the task logger configuration with worker's log directory
|
||||||
func (w *Worker) getTaskLoggerConfig() tasks.TaskLoggerConfig {
|
func (w *Worker) getTaskLoggerConfig() tasks.TaskLoggerConfig {
|
||||||
config := tasks.DefaultTaskLoggerConfig()
|
config := tasks.DefaultTaskLoggerConfig()
|
||||||
@@ -177,21 +418,29 @@ func (w *Worker) ID() string {
|
|||||||
return w.id
|
return w.id
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start starts the worker
|
|
||||||
func (w *Worker) Start() error {
|
func (w *Worker) Start() error {
|
||||||
w.mutex.Lock()
|
resp := make(chan error)
|
||||||
defer w.mutex.Unlock()
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionStart,
|
||||||
|
resp: resp,
|
||||||
|
}
|
||||||
|
return <-resp
|
||||||
|
}
|
||||||
|
|
||||||
if w.running {
|
// Start starts the worker
|
||||||
return fmt.Errorf("worker is already running")
|
func (w *Worker) handleStart(cmd workerCommand) {
|
||||||
|
if w.state.running {
|
||||||
|
cmd.resp <- fmt.Errorf("worker is already running")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if w.adminClient == nil {
|
if w.state.adminClient == nil {
|
||||||
return fmt.Errorf("admin client is not set")
|
cmd.resp <- fmt.Errorf("admin client is not set")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
w.running = true
|
w.state.running = true
|
||||||
w.startTime = time.Now()
|
w.state.startTime = time.Now()
|
||||||
|
|
||||||
// Prepare worker info for registration
|
// Prepare worker info for registration
|
||||||
workerInfo := &types.WorkerData{
|
workerInfo := &types.WorkerData{
|
||||||
@@ -204,7 +453,7 @@ func (w *Worker) Start() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Register worker info with client first (this stores it for use during connection)
|
// Register worker info with client first (this stores it for use during connection)
|
||||||
if err := w.adminClient.RegisterWorker(workerInfo); err != nil {
|
if err := w.state.adminClient.RegisterWorker(workerInfo); err != nil {
|
||||||
glog.V(1).Infof("Worker info stored for registration: %v", err)
|
glog.V(1).Infof("Worker info stored for registration: %v", err)
|
||||||
// This is expected if not connected yet
|
// This is expected if not connected yet
|
||||||
}
|
}
|
||||||
@@ -214,7 +463,7 @@ func (w *Worker) Start() error {
|
|||||||
w.id, w.config.Capabilities, w.config.MaxConcurrent)
|
w.id, w.config.Capabilities, w.config.MaxConcurrent)
|
||||||
|
|
||||||
// Try initial connection, but don't fail if it doesn't work immediately
|
// Try initial connection, but don't fail if it doesn't work immediately
|
||||||
if err := w.adminClient.Connect(); err != nil {
|
if err := w.state.adminClient.Connect(); err != nil {
|
||||||
glog.Warningf("INITIAL CONNECTION FAILED: Worker %s initial connection to admin server failed, will keep retrying: %v", w.id, err)
|
glog.Warningf("INITIAL CONNECTION FAILED: Worker %s initial connection to admin server failed, will keep retrying: %v", w.id, err)
|
||||||
// Don't return error - let the reconnection loop handle it
|
// Don't return error - let the reconnection loop handle it
|
||||||
} else {
|
} else {
|
||||||
@@ -230,54 +479,67 @@ func (w *Worker) Start() error {
|
|||||||
go w.messageProcessingLoop()
|
go w.messageProcessingLoop()
|
||||||
|
|
||||||
glog.Infof("WORKER STARTED: Worker %s started successfully (connection attempts will continue in background)", w.id)
|
glog.Infof("WORKER STARTED: Worker %s started successfully (connection attempts will continue in background)", w.id)
|
||||||
return nil
|
cmd.resp <- nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop stops the worker
|
|
||||||
func (w *Worker) Stop() error {
|
func (w *Worker) Stop() error {
|
||||||
w.mutex.Lock()
|
resp := make(chan error)
|
||||||
defer w.mutex.Unlock()
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionStop,
|
||||||
if !w.running {
|
resp: resp,
|
||||||
return nil
|
}
|
||||||
|
if err := <-resp; err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
w.running = false
|
// Wait for tasks to finish
|
||||||
close(w.stopChan)
|
|
||||||
|
|
||||||
// Stop tickers
|
|
||||||
if w.heartbeatTicker != nil {
|
|
||||||
w.heartbeatTicker.Stop()
|
|
||||||
}
|
|
||||||
if w.requestTicker != nil {
|
|
||||||
w.requestTicker.Stop()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for current tasks to complete or timeout
|
|
||||||
timeout := time.NewTimer(30 * time.Second)
|
timeout := time.NewTimer(30 * time.Second)
|
||||||
defer timeout.Stop()
|
defer timeout.Stop()
|
||||||
|
for w.getTaskLoad() > 0 {
|
||||||
for len(w.currentTasks) > 0 {
|
|
||||||
select {
|
select {
|
||||||
case <-timeout.C:
|
case <-timeout.C:
|
||||||
glog.Warningf("Worker %s stopping with %d tasks still running", w.id, len(w.currentTasks))
|
glog.Warningf("Worker %s stopping with %d tasks still running", w.id, w.getTaskLoad())
|
||||||
break
|
goto end_wait
|
||||||
case <-time.After(time.Second):
|
case <-time.After(100 * time.Millisecond):
|
||||||
// Check again
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
end_wait:
|
||||||
|
|
||||||
// Disconnect from admin server
|
// Disconnect from admin server
|
||||||
if w.adminClient != nil {
|
if adminClient := w.getAdmin(); adminClient != nil {
|
||||||
if err := w.adminClient.Disconnect(); err != nil {
|
if err := adminClient.Disconnect(); err != nil {
|
||||||
glog.Errorf("Error disconnecting from admin server: %v", err)
|
glog.Errorf("Error disconnecting from admin server: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
w.closeOnce.Do(func() {
|
||||||
|
close(w.cmds)
|
||||||
|
})
|
||||||
glog.Infof("Worker %s stopped", w.id)
|
glog.Infof("Worker %s stopped", w.id)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stop stops the worker
|
||||||
|
func (w *Worker) handleStop(cmd workerCommand) {
|
||||||
|
if !w.state.running {
|
||||||
|
cmd.resp <- nil
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
w.state.running = false
|
||||||
|
close(w.state.stopChan)
|
||||||
|
|
||||||
|
// Stop tickers
|
||||||
|
if w.state.heartbeatTicker != nil {
|
||||||
|
w.state.heartbeatTicker.Stop()
|
||||||
|
}
|
||||||
|
if w.state.requestTicker != nil {
|
||||||
|
w.state.requestTicker.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd.resp <- nil
|
||||||
|
}
|
||||||
|
|
||||||
// RegisterTask registers a task factory
|
// RegisterTask registers a task factory
|
||||||
func (w *Worker) RegisterTask(taskType types.TaskType, factory types.TaskFactory) {
|
func (w *Worker) RegisterTask(taskType types.TaskType, factory types.TaskFactory) {
|
||||||
w.registry.Register(taskType, factory)
|
w.registry.Register(taskType, factory)
|
||||||
@@ -290,31 +552,13 @@ func (w *Worker) GetCapabilities() []types.TaskType {
|
|||||||
|
|
||||||
// GetStatus returns the current worker status
|
// GetStatus returns the current worker status
|
||||||
func (w *Worker) GetStatus() types.WorkerStatus {
|
func (w *Worker) GetStatus() types.WorkerStatus {
|
||||||
w.mutex.RLock()
|
respCh := make(statusResponse, 1)
|
||||||
defer w.mutex.RUnlock()
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionGetStatus,
|
||||||
var currentTasks []types.TaskInput
|
data: respCh,
|
||||||
for _, task := range w.currentTasks {
|
resp: nil,
|
||||||
currentTasks = append(currentTasks, *task)
|
|
||||||
}
|
|
||||||
|
|
||||||
status := "active"
|
|
||||||
if len(w.currentTasks) >= w.config.MaxConcurrent {
|
|
||||||
status = "busy"
|
|
||||||
}
|
|
||||||
|
|
||||||
return types.WorkerStatus{
|
|
||||||
WorkerID: w.id,
|
|
||||||
Status: status,
|
|
||||||
Capabilities: w.config.Capabilities,
|
|
||||||
MaxConcurrent: w.config.MaxConcurrent,
|
|
||||||
CurrentLoad: len(w.currentTasks),
|
|
||||||
LastHeartbeat: time.Now(),
|
|
||||||
CurrentTasks: currentTasks,
|
|
||||||
Uptime: time.Since(w.startTime),
|
|
||||||
TasksCompleted: w.tasksCompleted,
|
|
||||||
TasksFailed: w.tasksFailed,
|
|
||||||
}
|
}
|
||||||
|
return <-respCh
|
||||||
}
|
}
|
||||||
|
|
||||||
// HandleTask handles a task execution
|
// HandleTask handles a task execution
|
||||||
@@ -322,22 +566,10 @@ func (w *Worker) HandleTask(task *types.TaskInput) error {
|
|||||||
glog.V(1).Infof("Worker %s received task %s (type: %s, volume: %d)",
|
glog.V(1).Infof("Worker %s received task %s (type: %s, volume: %d)",
|
||||||
w.id, task.ID, task.Type, task.VolumeID)
|
w.id, task.ID, task.Type, task.VolumeID)
|
||||||
|
|
||||||
w.mutex.Lock()
|
if err := w.setTask(task); err != nil {
|
||||||
currentLoad := len(w.currentTasks)
|
return err
|
||||||
if currentLoad >= w.config.MaxConcurrent {
|
|
||||||
w.mutex.Unlock()
|
|
||||||
glog.Errorf("TASK REJECTED: Worker %s at capacity (%d/%d) - rejecting task %s",
|
|
||||||
w.id, currentLoad, w.config.MaxConcurrent, task.ID)
|
|
||||||
return fmt.Errorf("worker is at capacity")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
w.currentTasks[task.ID] = task
|
|
||||||
newLoad := len(w.currentTasks)
|
|
||||||
w.mutex.Unlock()
|
|
||||||
|
|
||||||
glog.Infof("TASK ACCEPTED: Worker %s accepted task %s - current load: %d/%d",
|
|
||||||
w.id, task.ID, newLoad, w.config.MaxConcurrent)
|
|
||||||
|
|
||||||
// Execute task in goroutine
|
// Execute task in goroutine
|
||||||
go w.executeTask(task)
|
go w.executeTask(task)
|
||||||
|
|
||||||
@@ -366,7 +598,10 @@ func (w *Worker) SetTaskRequestInterval(interval time.Duration) {
|
|||||||
|
|
||||||
// SetAdminClient sets the admin client
|
// SetAdminClient sets the admin client
|
||||||
func (w *Worker) SetAdminClient(client AdminClient) {
|
func (w *Worker) SetAdminClient(client AdminClient) {
|
||||||
w.adminClient = client
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionSetAdmin,
|
||||||
|
data: client,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// executeTask executes a task
|
// executeTask executes a task
|
||||||
@@ -374,10 +609,7 @@ func (w *Worker) executeTask(task *types.TaskInput) {
|
|||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
w.mutex.Lock()
|
currentLoad := w.removeTask(task)
|
||||||
delete(w.currentTasks, task.ID)
|
|
||||||
currentLoad := len(w.currentTasks)
|
|
||||||
w.mutex.Unlock()
|
|
||||||
|
|
||||||
duration := time.Since(startTime)
|
duration := time.Since(startTime)
|
||||||
glog.Infof("TASK EXECUTION FINISHED: Worker %s finished executing task %s after %v - current load: %d/%d",
|
glog.Infof("TASK EXECUTION FINISHED: Worker %s finished executing task %s after %v - current load: %d/%d",
|
||||||
@@ -388,7 +620,7 @@ func (w *Worker) executeTask(task *types.TaskInput) {
|
|||||||
w.id, task.ID, task.Type, task.VolumeID, task.Server, task.Collection, startTime.Format(time.RFC3339))
|
w.id, task.ID, task.Type, task.VolumeID, task.Server, task.Collection, startTime.Format(time.RFC3339))
|
||||||
|
|
||||||
// Report task start to admin server
|
// Report task start to admin server
|
||||||
if err := w.adminClient.UpdateTaskProgress(task.ID, 0.0); err != nil {
|
if err := w.getAdmin().UpdateTaskProgress(task.ID, 0.0); err != nil {
|
||||||
glog.V(1).Infof("Failed to report task start to admin: %v", err)
|
glog.V(1).Infof("Failed to report task start to admin: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -461,7 +693,7 @@ func (w *Worker) executeTask(task *types.TaskInput) {
|
|||||||
taskInstance.SetProgressCallback(func(progress float64, stage string) {
|
taskInstance.SetProgressCallback(func(progress float64, stage string) {
|
||||||
// Report progress updates to admin server
|
// Report progress updates to admin server
|
||||||
glog.V(2).Infof("Task %s progress: %.1f%% - %s", task.ID, progress, stage)
|
glog.V(2).Infof("Task %s progress: %.1f%% - %s", task.ID, progress, stage)
|
||||||
if err := w.adminClient.UpdateTaskProgress(task.ID, progress); err != nil {
|
if err := w.getAdmin().UpdateTaskProgress(task.ID, progress); err != nil {
|
||||||
glog.V(1).Infof("Failed to report task progress to admin: %v", err)
|
glog.V(1).Infof("Failed to report task progress to admin: %v", err)
|
||||||
}
|
}
|
||||||
if fileLogger != nil {
|
if fileLogger != nil {
|
||||||
@@ -481,7 +713,9 @@ func (w *Worker) executeTask(task *types.TaskInput) {
|
|||||||
// Report completion
|
// Report completion
|
||||||
if err != nil {
|
if err != nil {
|
||||||
w.completeTask(task.ID, false, err.Error())
|
w.completeTask(task.ID, false, err.Error())
|
||||||
w.tasksFailed++
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionIncTaskFail,
|
||||||
|
}
|
||||||
glog.Errorf("Worker %s failed to execute task %s: %v", w.id, task.ID, err)
|
glog.Errorf("Worker %s failed to execute task %s: %v", w.id, task.ID, err)
|
||||||
if fileLogger != nil {
|
if fileLogger != nil {
|
||||||
fileLogger.LogStatus("failed", err.Error())
|
fileLogger.LogStatus("failed", err.Error())
|
||||||
@@ -489,18 +723,21 @@ func (w *Worker) executeTask(task *types.TaskInput) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
w.completeTask(task.ID, true, "")
|
w.completeTask(task.ID, true, "")
|
||||||
w.tasksCompleted++
|
w.cmds <- workerCommand{
|
||||||
|
action: ActionIncTaskComplete,
|
||||||
|
}
|
||||||
glog.Infof("Worker %s completed task %s successfully", w.id, task.ID)
|
glog.Infof("Worker %s completed task %s successfully", w.id, task.ID)
|
||||||
if fileLogger != nil {
|
if fileLogger != nil {
|
||||||
fileLogger.Info("Task %s completed successfully", task.ID)
|
fileLogger.Info("Task %s completed successfully", task.ID)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// completeTask reports task completion to admin server
|
// completeTask reports task completion to admin server
|
||||||
func (w *Worker) completeTask(taskID string, success bool, errorMsg string) {
|
func (w *Worker) completeTask(taskID string, success bool, errorMsg string) {
|
||||||
if w.adminClient != nil {
|
if w.getAdmin() != nil {
|
||||||
if err := w.adminClient.CompleteTask(taskID, success, errorMsg); err != nil {
|
if err := w.getAdmin().CompleteTask(taskID, success, errorMsg); err != nil {
|
||||||
glog.Errorf("Failed to report task completion: %v", err)
|
glog.Errorf("Failed to report task completion: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -508,14 +745,14 @@ func (w *Worker) completeTask(taskID string, success bool, errorMsg string) {
|
|||||||
|
|
||||||
// heartbeatLoop sends periodic heartbeats to the admin server
|
// heartbeatLoop sends periodic heartbeats to the admin server
|
||||||
func (w *Worker) heartbeatLoop() {
|
func (w *Worker) heartbeatLoop() {
|
||||||
w.heartbeatTicker = time.NewTicker(w.config.HeartbeatInterval)
|
defer w.setHbTick(time.NewTicker(w.config.HeartbeatInterval)).Stop()
|
||||||
defer w.heartbeatTicker.Stop()
|
ticker := w.getHbTick()
|
||||||
|
stopChan := w.getStopChan()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-w.stopChan:
|
case <-stopChan:
|
||||||
return
|
return
|
||||||
case <-w.heartbeatTicker.C:
|
case <-ticker.C:
|
||||||
w.sendHeartbeat()
|
w.sendHeartbeat()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -523,14 +760,14 @@ func (w *Worker) heartbeatLoop() {
|
|||||||
|
|
||||||
// taskRequestLoop periodically requests new tasks from the admin server
|
// taskRequestLoop periodically requests new tasks from the admin server
|
||||||
func (w *Worker) taskRequestLoop() {
|
func (w *Worker) taskRequestLoop() {
|
||||||
w.requestTicker = time.NewTicker(w.config.TaskRequestInterval)
|
defer w.setReqTick(time.NewTicker(w.config.TaskRequestInterval)).Stop()
|
||||||
defer w.requestTicker.Stop()
|
ticker := w.getReqTick()
|
||||||
|
stopChan := w.getStopChan()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-w.stopChan:
|
case <-stopChan:
|
||||||
return
|
return
|
||||||
case <-w.requestTicker.C:
|
case <-ticker.C:
|
||||||
w.requestTasks()
|
w.requestTasks()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -538,13 +775,13 @@ func (w *Worker) taskRequestLoop() {
|
|||||||
|
|
||||||
// sendHeartbeat sends heartbeat to admin server
|
// sendHeartbeat sends heartbeat to admin server
|
||||||
func (w *Worker) sendHeartbeat() {
|
func (w *Worker) sendHeartbeat() {
|
||||||
if w.adminClient != nil {
|
if w.getAdmin() != nil {
|
||||||
if err := w.adminClient.SendHeartbeat(w.id, &types.WorkerStatus{
|
if err := w.getAdmin().SendHeartbeat(w.id, &types.WorkerStatus{
|
||||||
WorkerID: w.id,
|
WorkerID: w.id,
|
||||||
Status: "active",
|
Status: "active",
|
||||||
Capabilities: w.config.Capabilities,
|
Capabilities: w.config.Capabilities,
|
||||||
MaxConcurrent: w.config.MaxConcurrent,
|
MaxConcurrent: w.config.MaxConcurrent,
|
||||||
CurrentLoad: len(w.currentTasks),
|
CurrentLoad: w.getTaskLoad(),
|
||||||
LastHeartbeat: time.Now(),
|
LastHeartbeat: time.Now(),
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
glog.Warningf("Failed to send heartbeat: %v", err)
|
glog.Warningf("Failed to send heartbeat: %v", err)
|
||||||
@@ -554,9 +791,7 @@ func (w *Worker) sendHeartbeat() {
|
|||||||
|
|
||||||
// requestTasks requests new tasks from the admin server
|
// requestTasks requests new tasks from the admin server
|
||||||
func (w *Worker) requestTasks() {
|
func (w *Worker) requestTasks() {
|
||||||
w.mutex.RLock()
|
currentLoad := w.getTaskLoad()
|
||||||
currentLoad := len(w.currentTasks)
|
|
||||||
w.mutex.RUnlock()
|
|
||||||
|
|
||||||
if currentLoad >= w.config.MaxConcurrent {
|
if currentLoad >= w.config.MaxConcurrent {
|
||||||
glog.V(3).Infof("TASK REQUEST SKIPPED: Worker %s at capacity (%d/%d)",
|
glog.V(3).Infof("TASK REQUEST SKIPPED: Worker %s at capacity (%d/%d)",
|
||||||
@@ -564,11 +799,11 @@ func (w *Worker) requestTasks() {
|
|||||||
return // Already at capacity
|
return // Already at capacity
|
||||||
}
|
}
|
||||||
|
|
||||||
if w.adminClient != nil {
|
if w.getAdmin() != nil {
|
||||||
glog.V(3).Infof("REQUESTING TASK: Worker %s requesting task from admin server (current load: %d/%d, capabilities: %v)",
|
glog.V(3).Infof("REQUESTING TASK: Worker %s requesting task from admin server (current load: %d/%d, capabilities: %v)",
|
||||||
w.id, currentLoad, w.config.MaxConcurrent, w.config.Capabilities)
|
w.id, currentLoad, w.config.MaxConcurrent, w.config.Capabilities)
|
||||||
|
|
||||||
task, err := w.adminClient.RequestTask(w.id, w.config.Capabilities)
|
task, err := w.getAdmin().RequestTask(w.id, w.config.Capabilities)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.V(2).Infof("TASK REQUEST FAILED: Worker %s failed to request task: %v", w.id, err)
|
glog.V(2).Infof("TASK REQUEST FAILED: Worker %s failed to request task: %v", w.id, err)
|
||||||
return
|
return
|
||||||
@@ -591,18 +826,6 @@ func (w *Worker) GetTaskRegistry() *tasks.TaskRegistry {
|
|||||||
return w.registry
|
return w.registry
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetCurrentTasks returns the current tasks
|
|
||||||
func (w *Worker) GetCurrentTasks() map[string]*types.TaskInput {
|
|
||||||
w.mutex.RLock()
|
|
||||||
defer w.mutex.RUnlock()
|
|
||||||
|
|
||||||
tasks := make(map[string]*types.TaskInput)
|
|
||||||
for id, task := range w.currentTasks {
|
|
||||||
tasks[id] = task
|
|
||||||
}
|
|
||||||
return tasks
|
|
||||||
}
|
|
||||||
|
|
||||||
// registerWorker registers the worker with the admin server
|
// registerWorker registers the worker with the admin server
|
||||||
func (w *Worker) registerWorker() {
|
func (w *Worker) registerWorker() {
|
||||||
workerInfo := &types.WorkerData{
|
workerInfo := &types.WorkerData{
|
||||||
@@ -614,7 +837,7 @@ func (w *Worker) registerWorker() {
|
|||||||
LastHeartbeat: time.Now(),
|
LastHeartbeat: time.Now(),
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := w.adminClient.RegisterWorker(workerInfo); err != nil {
|
if err := w.getAdmin().RegisterWorker(workerInfo); err != nil {
|
||||||
glog.Warningf("Failed to register worker (will retry on next heartbeat): %v", err)
|
glog.Warningf("Failed to register worker (will retry on next heartbeat): %v", err)
|
||||||
} else {
|
} else {
|
||||||
glog.Infof("Worker %s registered successfully with admin server", w.id)
|
glog.Infof("Worker %s registered successfully with admin server", w.id)
|
||||||
@@ -627,15 +850,15 @@ func (w *Worker) connectionMonitorLoop() {
|
|||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
lastConnectionStatus := false
|
lastConnectionStatus := false
|
||||||
|
stopChan := w.getStopChan()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-w.stopChan:
|
case <-stopChan:
|
||||||
glog.V(1).Infof("CONNECTION MONITOR STOPPING: Worker %s connection monitor loop stopping", w.id)
|
glog.V(1).Infof("CONNECTION MONITOR STOPPING: Worker %s connection monitor loop stopping", w.id)
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
// Monitor connection status and log changes
|
// Monitor connection status and log changes
|
||||||
currentConnectionStatus := w.adminClient != nil && w.adminClient.IsConnected()
|
currentConnectionStatus := w.getAdmin() != nil && w.getAdmin().IsConnected()
|
||||||
|
|
||||||
if currentConnectionStatus != lastConnectionStatus {
|
if currentConnectionStatus != lastConnectionStatus {
|
||||||
if currentConnectionStatus {
|
if currentConnectionStatus {
|
||||||
@@ -662,19 +885,17 @@ func (w *Worker) GetConfig() *types.WorkerConfig {
|
|||||||
|
|
||||||
// GetPerformanceMetrics returns performance metrics
|
// GetPerformanceMetrics returns performance metrics
|
||||||
func (w *Worker) GetPerformanceMetrics() *types.WorkerPerformance {
|
func (w *Worker) GetPerformanceMetrics() *types.WorkerPerformance {
|
||||||
w.mutex.RLock()
|
|
||||||
defer w.mutex.RUnlock()
|
|
||||||
|
|
||||||
uptime := time.Since(w.startTime)
|
uptime := time.Since(w.getStartTime())
|
||||||
var successRate float64
|
var successRate float64
|
||||||
totalTasks := w.tasksCompleted + w.tasksFailed
|
totalTasks := w.getCompletedTasks() + w.getFailedTasks()
|
||||||
if totalTasks > 0 {
|
if totalTasks > 0 {
|
||||||
successRate = float64(w.tasksCompleted) / float64(totalTasks) * 100
|
successRate = float64(w.getCompletedTasks()) / float64(totalTasks) * 100
|
||||||
}
|
}
|
||||||
|
|
||||||
return &types.WorkerPerformance{
|
return &types.WorkerPerformance{
|
||||||
TasksCompleted: w.tasksCompleted,
|
TasksCompleted: w.getCompletedTasks(),
|
||||||
TasksFailed: w.tasksFailed,
|
TasksFailed: w.getFailedTasks(),
|
||||||
AverageTaskTime: 0, // Would need to track this
|
AverageTaskTime: 0, // Would need to track this
|
||||||
Uptime: uptime,
|
Uptime: uptime,
|
||||||
SuccessRate: successRate,
|
SuccessRate: successRate,
|
||||||
@@ -686,7 +907,7 @@ func (w *Worker) messageProcessingLoop() {
|
|||||||
glog.Infof("MESSAGE LOOP STARTED: Worker %s message processing loop started", w.id)
|
glog.Infof("MESSAGE LOOP STARTED: Worker %s message processing loop started", w.id)
|
||||||
|
|
||||||
// Get access to the incoming message channel from gRPC client
|
// Get access to the incoming message channel from gRPC client
|
||||||
grpcClient, ok := w.adminClient.(*GrpcAdminClient)
|
grpcClient, ok := w.getAdmin().(*GrpcAdminClient)
|
||||||
if !ok {
|
if !ok {
|
||||||
glog.Warningf("MESSAGE LOOP UNAVAILABLE: Worker %s admin client is not gRPC client, message processing not available", w.id)
|
glog.Warningf("MESSAGE LOOP UNAVAILABLE: Worker %s admin client is not gRPC client, message processing not available", w.id)
|
||||||
return
|
return
|
||||||
@@ -694,10 +915,10 @@ func (w *Worker) messageProcessingLoop() {
|
|||||||
|
|
||||||
incomingChan := grpcClient.GetIncomingChannel()
|
incomingChan := grpcClient.GetIncomingChannel()
|
||||||
glog.V(1).Infof("MESSAGE CHANNEL READY: Worker %s connected to incoming message channel", w.id)
|
glog.V(1).Infof("MESSAGE CHANNEL READY: Worker %s connected to incoming message channel", w.id)
|
||||||
|
stopChan := w.getStopChan()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-w.stopChan:
|
case <-stopChan:
|
||||||
glog.Infof("MESSAGE LOOP STOPPING: Worker %s message processing loop stopping", w.id)
|
glog.Infof("MESSAGE LOOP STOPPING: Worker %s message processing loop stopping", w.id)
|
||||||
return
|
return
|
||||||
case message := <-incomingChan:
|
case message := <-incomingChan:
|
||||||
@@ -773,7 +994,7 @@ func (w *Worker) handleTaskLogRequest(request *worker_pb.TaskLogRequest) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
grpcClient, ok := w.adminClient.(*GrpcAdminClient)
|
grpcClient, ok := w.getAdmin().(*GrpcAdminClient)
|
||||||
if !ok {
|
if !ok {
|
||||||
glog.Errorf("Cannot send task log response: admin client is not gRPC client")
|
glog.Errorf("Cannot send task log response: admin client is not gRPC client")
|
||||||
return
|
return
|
||||||
@@ -791,14 +1012,10 @@ func (w *Worker) handleTaskLogRequest(request *worker_pb.TaskLogRequest) {
|
|||||||
func (w *Worker) handleTaskCancellation(cancellation *worker_pb.TaskCancellation) {
|
func (w *Worker) handleTaskCancellation(cancellation *worker_pb.TaskCancellation) {
|
||||||
glog.Infof("Worker %s received task cancellation for task %s", w.id, cancellation.TaskId)
|
glog.Infof("Worker %s received task cancellation for task %s", w.id, cancellation.TaskId)
|
||||||
|
|
||||||
w.mutex.Lock()
|
w.cmds <- workerCommand{
|
||||||
defer w.mutex.Unlock()
|
action: ActionCancelTask,
|
||||||
|
data: cancellation.TaskId,
|
||||||
if task, exists := w.currentTasks[cancellation.TaskId]; exists {
|
resp: nil,
|
||||||
// TODO: Implement task cancellation logic
|
|
||||||
glog.Infof("Cancelling task %s", task.ID)
|
|
||||||
} else {
|
|
||||||
glog.Warningf("Cannot cancel task %s: task not found", cancellation.TaskId)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user