package worker

import (
	"context"
	"errors"
	"fmt"
	"io"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
	"google.golang.org/grpc"
)

var (
	ErrAlreadyConnected = errors.New("already connected")
)

// GrpcAdminClient implements AdminClient using gRPC bidirectional streaming
type GrpcAdminClient struct {
	adminAddress string
	workerID     string
	dialOption   grpc.DialOption

	cmds chan grpcCommand

	// Reconnection parameters
	maxReconnectAttempts int
	reconnectBackoff     time.Duration
	maxReconnectBackoff  time.Duration
	reconnectMultiplier  float64

	// Channels for communication
	outgoing      chan *worker_pb.WorkerMessage
	incoming      chan *worker_pb.AdminMessage
	responseChans map[string]chan *worker_pb.AdminMessage
}

type grpcAction string

const (
	ActionConnect              grpcAction = "connect"
	ActionDisconnect           grpcAction = "disconnect"
	ActionReconnect            grpcAction = "reconnect"
	ActionStreamError          grpcAction = "stream_error"
	ActionRegisterWorker       grpcAction = "register_worker"
	ActionQueryReconnecting    grpcAction = "query_reconnecting"
	ActionQueryConnected       grpcAction = "query_connected"
	ActionQueryShouldReconnect grpcAction = "query_shouldreconnect"
)

type registrationRequest struct {
	Worker *types.WorkerData
	Resp   chan error // Used to send the registration result back
}

type grpcCommand struct {
	action grpcAction
	data   any
	resp   chan error // for reporting success/failure
}

type grpcState struct {
	connected       bool
	reconnecting    bool
	shouldReconnect bool
	conn            *grpc.ClientConn
	client          worker_pb.WorkerServiceClient
	stream          worker_pb.WorkerService_WorkerStreamClient
	streamCtx       context.Context
	streamCancel    context.CancelFunc
	lastWorkerInfo  *types.WorkerData
	reconnectStop   chan struct{}
	streamExit      chan struct{}
}

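// Every field of grpcState is owned exclusively by the managerLoop goroutine.
// Public methods never touch this state directly: they send a grpcCommand on
// c.cmds and block on its reply channel, so no mutex is needed. A sketch of
// the round-trip that every public method performs (illustrative only, not a
// separate API):
//
//	resp := make(chan error)
//	c.cmds <- grpcCommand{action: ActionConnect, resp: resp}
//	err := <-resp // managerLoop has fully processed the command
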
// NewGrpcAdminClient creates a new gRPC admin client
func NewGrpcAdminClient(adminAddress string, workerID string, dialOption grpc.DialOption) *GrpcAdminClient {
	// Admin uses HTTP port + 10000 as gRPC port
	grpcAddress := pb.ServerToGrpcAddress(adminAddress)

	c := &GrpcAdminClient{
		adminAddress:         grpcAddress,
		workerID:             workerID,
		dialOption:           dialOption,
		maxReconnectAttempts: 0, // 0 means infinite attempts
		reconnectBackoff:     1 * time.Second,
		maxReconnectBackoff:  30 * time.Second,
		reconnectMultiplier:  1.5,
		outgoing:             make(chan *worker_pb.WorkerMessage, 100),
		incoming:             make(chan *worker_pb.AdminMessage, 100),
		responseChans:        make(map[string]chan *worker_pb.AdminMessage),
		cmds:                 make(chan grpcCommand),
	}
	go c.managerLoop()
	return c
}

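// Example construction (a sketch; the address, worker ID, and plaintext
// credentials below are placeholders, not the only valid choices):
//
//	import "google.golang.org/grpc/credentials/insecure"
//
//	client := NewGrpcAdminClient("localhost:23646", "worker-1",
//		grpc.WithTransportCredentials(insecure.NewCredentials()))
//	if err := client.Connect(); err != nil {
//		// The reconnection loop keeps retrying in the background.
//		glog.Warningf("initial connect failed: %v", err)
//	}
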
func (c *GrpcAdminClient) managerLoop() {
	state := &grpcState{shouldReconnect: true}

out:
	for cmd := range c.cmds {
		switch cmd.action {
		case ActionConnect:
			c.handleConnect(cmd, state)
		case ActionDisconnect:
			c.handleDisconnect(cmd, state)
			break out
		case ActionReconnect:
			if state.connected || state.reconnecting || !state.shouldReconnect {
				cmd.resp <- ErrAlreadyConnected
				continue
			}
			state.reconnecting = true // Manager acknowledges the attempt
			err := c.reconnect(state)
			state.reconnecting = false
			cmd.resp <- err
		case ActionStreamError:
			state.connected = false
		case ActionRegisterWorker:
			req := cmd.data.(registrationRequest)
			state.lastWorkerInfo = req.Worker
			if !state.connected {
				glog.V(1).Infof("Not connected yet, worker info stored for registration upon connection")
				// Respond immediately with success (registration will happen later)
				req.Resp <- nil
				continue
			}
			err := c.sendRegistration(req.Worker)
			req.Resp <- err
		case ActionQueryConnected:
			respCh := cmd.data.(chan bool)
			respCh <- state.connected
		case ActionQueryReconnecting:
			respCh := cmd.data.(chan bool)
			respCh <- state.reconnecting
		case ActionQueryShouldReconnect:
			respCh := cmd.data.(chan bool)
			respCh <- state.shouldReconnect
		}
	}
}

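// Note on shutdown: ActionDisconnect breaks the labeled loop above, so the
// manager goroutine exits by itself once handleDisconnect has replied. Any
// command sent after that would block forever, so Disconnect must be the
// final command issued to this client.
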
// Connect establishes gRPC connection to admin server with TLS detection
func (c *GrpcAdminClient) Connect() error {
	resp := make(chan error)
	c.cmds <- grpcCommand{
		action: ActionConnect,
		resp:   resp,
	}
	return <-resp
}

func (c *GrpcAdminClient) handleConnect(cmd grpcCommand, s *grpcState) {
	if s.connected {
		cmd.resp <- ErrAlreadyConnected
		return
	}

	// Start reconnection loop immediately (async)
	stop := make(chan struct{})
	s.reconnectStop = stop
	go c.reconnectionLoop(stop)

	// Attempt the initial connection
	if err := c.attemptConnection(s); err != nil {
		glog.V(1).Infof("Initial connection failed, reconnection loop will retry: %v", err)
		cmd.resp <- err
		return
	}
	cmd.resp <- nil
}

// createConnection attempts to connect using the provided dial option
func (c *GrpcAdminClient) createConnection() (*grpc.ClientConn, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	conn, err := pb.GrpcDial(ctx, c.adminAddress, false, c.dialOption)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to admin server: %w", err)
	}

	glog.Infof("Connected to admin server at %s", c.adminAddress)
	return conn, nil
}

// attemptConnection tries to establish the connection without managing the reconnection loop
func (c *GrpcAdminClient) attemptConnection(s *grpcState) error {
	// Detect TLS support and create appropriate connection
	conn, err := c.createConnection()
	if err != nil {
		return fmt.Errorf("failed to connect to admin server: %w", err)
	}

	s.conn = conn
	s.client = worker_pb.NewWorkerServiceClient(conn)

	// Create bidirectional stream
	s.streamCtx, s.streamCancel = context.WithCancel(context.Background())
	stream, err := s.client.WorkerStream(s.streamCtx)
	if err != nil {
		s.streamCancel()
		s.conn.Close()
		return fmt.Errorf("failed to create worker stream: %w", err)
	}
	glog.Infof("Worker stream created")
	s.connected = true
	s.stream = stream

	// Always check for worker info and send registration immediately as the very first message
	if s.lastWorkerInfo != nil {
		// Send registration synchronously as the very first message
		if err := c.sendRegistrationSync(s.lastWorkerInfo, s.stream); err != nil {
			s.streamCancel()
			s.conn.Close()
			s.connected = false
			return fmt.Errorf("failed to register worker: %w", err)
		}
		glog.Infof("Worker registered successfully with admin server")
	} else {
		// No worker info yet - stream will wait for registration
		glog.V(1).Infof("Connected to admin server, waiting for worker registration info")
	}

	// Start stream handlers
	s.streamExit = make(chan struct{})
	go handleOutgoing(s.stream, s.streamExit, c.outgoing, c.cmds)
	go handleIncoming(c.workerID, s.stream, s.streamExit, c.incoming, c.cmds)

	glog.Infof("Connected to admin server at %s", c.adminAddress)
	return nil
}

// reconnect attempts to re-establish the connection
func (c *GrpcAdminClient) reconnect(s *grpcState) error {
	// Clean up existing connection completely
	if s.streamCancel != nil {
		s.streamCancel()
	}
	if s.conn != nil {
		s.conn.Close()
	}
	s.connected = false

	// Attempt to re-establish connection using the same logic as initial connection
	if err := c.attemptConnection(s); err != nil {
		return fmt.Errorf("failed to reconnect: %w", err)
	}

	// Registration is now handled in attemptConnection if worker info is available
	return nil
}

// reconnectionLoop handles automatic reconnection with exponential backoff
func (c *GrpcAdminClient) reconnectionLoop(reconnectStop chan struct{}) {
	backoff := c.reconnectBackoff
	attempts := 0

	for {
		waitDuration := backoff
		if attempts == 0 {
			waitDuration = time.Second
		}
		select {
		case <-reconnectStop:
			return
		case <-time.After(waitDuration):
		}
		resp := make(chan error, 1)
		c.cmds <- grpcCommand{
			action: ActionReconnect,
			resp:   resp,
		}
		err := <-resp
		if err == nil {
			// Successful reconnection
			attempts = 0
			backoff = c.reconnectBackoff
			glog.Infof("Successfully reconnected to admin server")
		} else if errors.Is(err, ErrAlreadyConnected) {
			attempts = 0
			backoff = c.reconnectBackoff
		} else {
			attempts++
			glog.Errorf("Reconnection attempt %d failed: %v", attempts, err)

			// Check if we should give up
			if c.maxReconnectAttempts > 0 && attempts >= c.maxReconnectAttempts {
				glog.Errorf("Max reconnection attempts (%d) reached, giving up", c.maxReconnectAttempts)
				return
			}

			// Increase backoff
			backoff = time.Duration(float64(backoff) * c.reconnectMultiplier)
			if backoff > c.maxReconnectBackoff {
				backoff = c.maxReconnectBackoff
			}
			glog.Infof("Waiting %v before next reconnection attempt", backoff)
		}
	}
}

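// With the defaults set in NewGrpcAdminClient (1s initial backoff, 1.5x
// multiplier, 30s cap, unlimited attempts), consecutive failures wait
// roughly 1s, 1.5s, 2.25s, 3.4s, 5.1s, ... until capped at 30s; the backoff
// resets to 1s after any successful (or already-connected) attempt.
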
// handleOutgoing processes outgoing messages to admin
func handleOutgoing(
	stream worker_pb.WorkerService_WorkerStreamClient,
	streamExit <-chan struct{},
	outgoing <-chan *worker_pb.WorkerMessage,
	cmds chan<- grpcCommand) {

	msgCh := make(chan *worker_pb.WorkerMessage)
	errCh := make(chan error, 1) // Buffered to prevent blocking if the manager is busy

	// Goroutine to handle blocking stream.Send() so the loop below can
	// simultaneously handle exit signals
	go func() {
		for msg := range msgCh {
			if err := stream.Send(msg); err != nil {
				errCh <- err
				return // Exit the sender goroutine on error/EOF
			}
		}
		close(errCh)
	}()

	for msg := range outgoing {
		select {
		case msgCh <- msg:
		case err := <-errCh:
			glog.Errorf("Failed to send message to admin: %v", err)
			cmds <- grpcCommand{action: ActionStreamError, data: err}
			return
		case <-streamExit:
			close(msgCh)
			<-errCh
			return
		}
	}
}

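// Only the goroutine above ever calls stream.Send: gRPC streams allow at most
// one concurrent sender, so funneling all writes through msgCh is what makes
// shutdown race-free. Teardown happens via the stream context (streamCancel)
// rather than CloseSend, because CloseSend is not safe to call while another
// goroutine may still be calling Send.
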
// handleIncoming processes incoming messages from admin
func handleIncoming(
	workerID string,
	stream worker_pb.WorkerService_WorkerStreamClient,
	streamExit <-chan struct{},
	incoming chan<- *worker_pb.AdminMessage,
	cmds chan<- grpcCommand) {

	glog.V(1).Infof("INCOMING HANDLER STARTED: Worker %s incoming message handler started", workerID)
	msgCh := make(chan *worker_pb.AdminMessage)
	errCh := make(chan error, 1) // Buffered to prevent blocking if the manager is busy

	// Goroutine to handle blocking stream.Recv() so the loop below can
	// simultaneously handle exit signals
	go func() {
		for {
			msg, err := stream.Recv()
			if err != nil {
				errCh <- err
				return // Exit the receiver goroutine on error/EOF
			}
			msgCh <- msg
		}
	}()

	for {
		glog.V(4).Infof("LISTENING: Worker %s waiting for message from admin server", workerID)

		select {
		case msg := <-msgCh:
			// Message successfully received from the stream
			glog.V(4).Infof("MESSAGE RECEIVED: Worker %s received message from admin server: %T", workerID, msg.Message)

			// Route message to waiting goroutines or general handler
			select {
			case incoming <- msg:
				glog.V(3).Infof("MESSAGE ROUTED: Worker %s successfully routed message to handler", workerID)
			case <-time.After(time.Second):
				glog.Warningf("MESSAGE DROPPED: Worker %s incoming message buffer full, dropping message: %T", workerID, msg.Message)
			}

		case err := <-errCh:
			// Stream receiver goroutine reported an error (EOF or network error)
			if err == io.EOF {
				glog.Infof("STREAM CLOSED: Worker %s admin server closed the stream", workerID)
			} else {
				glog.Errorf("RECEIVE ERROR: Worker %s failed to receive message from admin: %v", workerID, err)
			}

			// Report the failure as a command to the managerLoop (blocking)
			cmds <- grpcCommand{action: ActionStreamError, data: err}

			// Exit the main handler loop
			glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler due to stream error", workerID)
			return

		case <-streamExit:
			// Manager closed this channel, signaling a controlled disconnection.
			glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler - received exit signal", workerID)
			return
		}
	}
}

// Disconnect gracefully disconnects from the admin server
func (c *GrpcAdminClient) Disconnect() error {
	resp := make(chan error)
	c.cmds <- grpcCommand{
		action: ActionDisconnect,
		resp:   resp,
	}
	return <-resp
}

func (c *GrpcAdminClient) handleDisconnect(cmd grpcCommand, s *grpcState) {
	if !s.connected {
		cmd.resp <- fmt.Errorf("already disconnected")
		return
	}

	// Send shutdown signal to stop reconnection loop
	close(s.reconnectStop)

	s.connected = false
	s.shouldReconnect = false

	// Send shutdown message
	shutdownMsg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Shutdown{
			Shutdown: &worker_pb.WorkerShutdown{
				WorkerId: c.workerID,
				Reason:   "normal shutdown",
			},
		},
	}

	select {
	case c.outgoing <- shutdownMsg:
	case <-time.After(time.Second):
		glog.Warningf("Failed to send shutdown message")
	}

	// Send exit signal to stop the stream handlers, AFTER the shutdown
	// message has been enqueued
	close(s.streamExit)

	// Cancel stream context
	if s.streamCancel != nil {
		s.streamCancel()
	}

	// Close connection
	if s.conn != nil {
		s.conn.Close()
	}

	// Close channels
	close(c.outgoing)
	close(c.incoming)

	glog.Infof("Disconnected from admin server")
	cmd.resp <- nil
}

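// The ordering above is deliberate: the shutdown message is enqueued on
// c.outgoing before streamExit is closed, giving handleOutgoing a chance to
// deliver it; only afterwards are the stream context canceled, the connection
// closed, and the channels closed.
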
// RegisterWorker registers the worker with the admin server
func (c *GrpcAdminClient) RegisterWorker(worker *types.WorkerData) error {
	respCh := make(chan error, 1)
	request := registrationRequest{
		Worker: worker,
		Resp:   respCh,
	}
	c.cmds <- grpcCommand{
		action: ActionRegisterWorker,
		data:   request,
	}
	return <-respCh
}

// sendRegistration sends the registration message and waits for response
func (c *GrpcAdminClient) sendRegistration(worker *types.WorkerData) error {
	capabilities := make([]string, len(worker.Capabilities))
	for i, cap := range worker.Capabilities {
		capabilities[i] = string(cap)
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Registration{
			Registration: &worker_pb.WorkerRegistration{
				WorkerId:      c.workerID,
				Address:       worker.Address,
				Capabilities:  capabilities,
				MaxConcurrent: int32(worker.MaxConcurrent),
				Metadata:      make(map[string]string),
			},
		},
	}

	select {
	case c.outgoing <- msg:
	case <-time.After(5 * time.Second):
		return fmt.Errorf("failed to send registration message: timeout")
	}

	// Wait for registration response
	timeout := time.NewTimer(10 * time.Second)
	defer timeout.Stop()

	for {
		select {
		case response := <-c.incoming:
			if regResp := response.GetRegistrationResponse(); regResp != nil {
				if regResp.Success {
					glog.Infof("Worker registered successfully: %s", regResp.Message)
					return nil
				}
				return fmt.Errorf("registration failed: %s", regResp.Message)
			}
		case <-timeout.C:
			return fmt.Errorf("registration timeout")
		}
	}
}

// sendRegistrationSync sends the registration message synchronously
func (c *GrpcAdminClient) sendRegistrationSync(worker *types.WorkerData, stream worker_pb.WorkerService_WorkerStreamClient) error {
	capabilities := make([]string, len(worker.Capabilities))
	for i, cap := range worker.Capabilities {
		capabilities[i] = string(cap)
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Registration{
			Registration: &worker_pb.WorkerRegistration{
				WorkerId:      c.workerID,
				Address:       worker.Address,
				Capabilities:  capabilities,
				MaxConcurrent: int32(worker.MaxConcurrent),
				Metadata:      make(map[string]string),
			},
		},
	}

	// Send directly to stream to ensure it's the first message
	if err := stream.Send(msg); err != nil {
		return fmt.Errorf("failed to send registration message: %w", err)
	}

	// Create channels to receive the response
	responseChan := make(chan *worker_pb.AdminMessage, 1)
	errChan := make(chan error, 1)

	// Start a goroutine to listen for the response
	go func() {
		for {
			response, err := stream.Recv()
			if err != nil {
				errChan <- fmt.Errorf("failed to receive registration response: %w", err)
				return
			}

			if regResp := response.GetRegistrationResponse(); regResp != nil {
				responseChan <- response
				return
			}
			// Continue waiting if it's not a registration response.
			// If the stream is stuck, reconnect() will kill it, cleaning up
			// this goroutine.
		}
	}()

	// Wait for registration response with timeout
	timeout := time.NewTimer(10 * time.Second)
	defer timeout.Stop()

	select {
	case response := <-responseChan:
		if regResp := response.GetRegistrationResponse(); regResp != nil {
			if regResp.Success {
				glog.V(1).Infof("Worker registered successfully: %s", regResp.Message)
				return nil
			}
			return fmt.Errorf("registration failed: %s", regResp.Message)
		}
		return fmt.Errorf("unexpected response type")
	case err := <-errChan:
		return err
	case <-timeout.C:
		return fmt.Errorf("registration timeout")
	}
}

func (c *GrpcAdminClient) IsConnected() bool {
	respCh := make(chan bool, 1)

	c.cmds <- grpcCommand{
		action: ActionQueryConnected,
		data:   respCh,
	}

	return <-respCh
}

func (c *GrpcAdminClient) IsReconnecting() bool {
	respCh := make(chan bool, 1)

	c.cmds <- grpcCommand{
		action: ActionQueryReconnecting,
		data:   respCh,
	}

	return <-respCh
}

func (c *GrpcAdminClient) ShouldReconnect() bool {
	respCh := make(chan bool, 1)

	c.cmds <- grpcCommand{
		action: ActionQueryShouldReconnect,
		data:   respCh,
	}

	return <-respCh
}

// SendHeartbeat sends heartbeat to admin server
func (c *GrpcAdminClient) SendHeartbeat(workerID string, status *types.WorkerStatus) error {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just skip the heartbeat
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("Skipping heartbeat during reconnection")
			return nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(10 * time.Second); err != nil {
			return fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	taskIds := make([]string, len(status.CurrentTasks))
	for i, task := range status.CurrentTasks {
		taskIds[i] = task.ID
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Heartbeat{
			Heartbeat: &worker_pb.WorkerHeartbeat{
				WorkerId:       c.workerID,
				Status:         status.Status,
				CurrentLoad:    int32(status.CurrentLoad),
				MaxConcurrent:  int32(status.MaxConcurrent),
				CurrentTaskIds: taskIds,
				TasksCompleted: int32(status.TasksCompleted),
				TasksFailed:    int32(status.TasksFailed),
				UptimeSeconds:  int64(status.Uptime.Seconds()),
			},
		},
	}

	select {
	case c.outgoing <- msg:
		return nil
	case <-time.After(time.Second):
		return fmt.Errorf("failed to send heartbeat: timeout")
	}
}

// RequestTask requests a new task from admin server
func (c *GrpcAdminClient) RequestTask(workerID string, capabilities []types.TaskType) (*types.TaskInput, error) {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just return no task
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("RECONNECTING: Worker %s skipping task request during reconnection", workerID)
			return nil, nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(5 * time.Second); err != nil {
			return nil, fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	caps := make([]string, len(capabilities))
	for i, cap := range capabilities {
		caps[i] = string(cap)
	}

	glog.V(3).Infof("📤 SENDING TASK REQUEST: Worker %s sending task request to admin server with capabilities: %v",
		workerID, capabilities)

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_TaskRequest{
			TaskRequest: &worker_pb.TaskRequest{
				WorkerId:       c.workerID,
				Capabilities:   caps,
				AvailableSlots: 1, // Request one task
			},
		},
	}

	select {
	case c.outgoing <- msg:
		glog.V(3).Infof("TASK REQUEST SENT: Worker %s successfully sent task request to admin server", workerID)
	case <-time.After(time.Second):
		glog.Errorf("TASK REQUEST TIMEOUT: Worker %s failed to send task request: timeout", workerID)
		return nil, fmt.Errorf("failed to send task request: timeout")
	}

	// Wait for task assignment
	glog.V(3).Infof("WAITING FOR RESPONSE: Worker %s waiting for task assignment response (5s timeout)", workerID)
	timeout := time.NewTimer(5 * time.Second)
	defer timeout.Stop()

	for {
		select {
		case response := <-c.incoming:
			glog.V(3).Infof("RESPONSE RECEIVED: Worker %s received response from admin server: %T", workerID, response.Message)
			if taskAssign := response.GetTaskAssignment(); taskAssign != nil {
				glog.V(1).Infof("Worker %s received task assignment in response: %s (type: %s, volume: %d)",
					workerID, taskAssign.TaskId, taskAssign.TaskType, taskAssign.Params.VolumeId)

				// Convert to our task type
				task := &types.TaskInput{
					ID:         taskAssign.TaskId,
					Type:       types.TaskType(taskAssign.TaskType),
					Status:     types.TaskStatusAssigned,
					VolumeID:   taskAssign.Params.VolumeId,
					Server:     getServerFromParams(taskAssign.Params),
					Collection: taskAssign.Params.Collection,
					Priority:   types.TaskPriority(taskAssign.Priority),
					CreatedAt:  time.Unix(taskAssign.CreatedTime, 0),
					// Use typed protobuf parameters directly
					TypedParams: taskAssign.Params,
				}
				return task, nil
			}
			glog.V(3).Infof("NON-TASK RESPONSE: Worker %s received non-task response: %T", workerID, response.Message)
		case <-timeout.C:
			glog.V(3).Infof("TASK REQUEST TIMEOUT: Worker %s - no task assignment received within 5 seconds", workerID)
			return nil, nil // No task available
		}
	}
}

// CompleteTask reports task completion to admin server
func (c *GrpcAdminClient) CompleteTask(taskID string, success bool, errorMsg string) error {
	return c.CompleteTaskWithMetadata(taskID, success, errorMsg, nil)
}

// CompleteTaskWithMetadata reports task completion with additional metadata
func (c *GrpcAdminClient) CompleteTaskWithMetadata(taskID string, success bool, errorMsg string, metadata map[string]string) error {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just skip the completion report
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("Skipping task completion report during reconnection for task %s", taskID)
			return nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(5 * time.Second); err != nil {
			return fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	taskComplete := &worker_pb.TaskComplete{
		TaskId:         taskID,
		WorkerId:       c.workerID,
		Success:        success,
		ErrorMessage:   errorMsg,
		CompletionTime: time.Now().Unix(),
	}

	// Add metadata if provided
	if metadata != nil {
		taskComplete.ResultMetadata = metadata
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_TaskComplete{
			TaskComplete: taskComplete,
		},
	}

	select {
	case c.outgoing <- msg:
		return nil
	case <-time.After(time.Second):
		return fmt.Errorf("failed to send task completion: timeout")
	}
}

// UpdateTaskProgress updates task progress to admin server
func (c *GrpcAdminClient) UpdateTaskProgress(taskID string, progress float64) error {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just skip the progress update
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("Skipping task progress update during reconnection for task %s", taskID)
			return nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(5 * time.Second); err != nil {
			return fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_TaskUpdate{
			TaskUpdate: &worker_pb.TaskUpdate{
				TaskId:   taskID,
				WorkerId: c.workerID,
				Status:   "in_progress",
				Progress: float32(progress),
			},
		},
	}

	select {
	case c.outgoing <- msg:
		return nil
	case <-time.After(time.Second):
		return fmt.Errorf("failed to send task progress: timeout")
	}
}

// waitForConnection waits for the connection to be established or timeout
func (c *GrpcAdminClient) waitForConnection(timeout time.Duration) error {
	deadline := time.Now().Add(timeout)

	for time.Now().Before(deadline) {
		if c.IsConnected() {
			return nil
		}

		if !c.ShouldReconnect() {
			return fmt.Errorf("reconnection is disabled")
		}

		time.Sleep(100 * time.Millisecond)
	}

	return fmt.Errorf("timeout waiting for connection")
}

// GetIncomingChannel returns the incoming message channel for message processing
// This allows the worker to process admin messages directly
func (c *GrpcAdminClient) GetIncomingChannel() <-chan *worker_pb.AdminMessage {
	return c.incoming
}

// CreateAdminClient creates an admin client with the provided dial option
func CreateAdminClient(adminServer string, workerID string, dialOption grpc.DialOption) (AdminClient, error) {
	return NewGrpcAdminClient(adminServer, workerID, dialOption), nil
}

// getServerFromParams extracts server address from unified sources
func getServerFromParams(params *worker_pb.TaskParams) string {
	if len(params.Sources) > 0 {
		return params.Sources[0].Node
	}
	return ""
}