package worker

import (
	"context"
	"errors"
	"fmt"
	"io"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
	"github.com/seaweedfs/seaweedfs/weed/worker/types"
	"google.golang.org/grpc"
)

var (
	ErrAlreadyConnected = errors.New("already connected")
)

// GrpcAdminClient implements AdminClient using gRPC bidirectional streaming
type GrpcAdminClient struct {
	adminAddress string
	workerID     string
	dialOption   grpc.DialOption

	cmds chan grpcCommand

	// Reconnection parameters
	maxReconnectAttempts int
	reconnectBackoff     time.Duration
	maxReconnectBackoff  time.Duration
	reconnectMultiplier  float64

	// Channels for communication
	outgoing      chan *worker_pb.WorkerMessage
	incoming      chan *worker_pb.AdminMessage
	responseChans map[string]chan *worker_pb.AdminMessage
}

type grpcAction string

const (
	ActionConnect              grpcAction = "connect"
	ActionDisconnect           grpcAction = "disconnect"
	ActionReconnect            grpcAction = "reconnect"
	ActionStreamError          grpcAction = "stream_error"
	ActionRegisterWorker       grpcAction = "register_worker"
	ActionQueryReconnecting    grpcAction = "query_reconnecting"
	ActionQueryConnected       grpcAction = "query_connected"
	ActionQueryShouldReconnect grpcAction = "query_shouldreconnect"
)

type registrationRequest struct {
	Worker *types.WorkerData
	Resp   chan error // Used to send the registration result back
}

type grpcCommand struct {
	action grpcAction
	data   any
	resp   chan error // for reporting success/failure
}

type grpcState struct {
	connected       bool
	reconnecting    bool
	shouldReconnect bool
	conn            *grpc.ClientConn
	client          worker_pb.WorkerServiceClient
	stream          worker_pb.WorkerService_WorkerStreamClient
	streamCtx       context.Context
	streamCancel    context.CancelFunc
	lastWorkerInfo  *types.WorkerData
	reconnectStop   chan struct{}
	streamExit      chan struct{}
}

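// Every field of grpcState is owned exclusively by the managerLoop goroutine.
// Public methods never touch this state directly: they send a grpcCommand on
// c.cmds and block on its reply channel, so no mutex is needed. A sketch of
// the round-trip that every public method performs (illustrative only, not a
// separate API):
//
//	resp := make(chan error)
//	c.cmds <- grpcCommand{action: ActionConnect, resp: resp}
//	err := <-resp // managerLoop has fully processed the command
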
// NewGrpcAdminClient creates a new gRPC admin client
func NewGrpcAdminClient(adminAddress string, workerID string, dialOption grpc.DialOption) *GrpcAdminClient {
	// Admin uses HTTP port + 10000 as gRPC port
	grpcAddress := pb.ServerToGrpcAddress(adminAddress)

	c := &GrpcAdminClient{
		adminAddress:         grpcAddress,
		workerID:             workerID,
		dialOption:           dialOption,
		maxReconnectAttempts: 0, // 0 means infinite attempts
		reconnectBackoff:     1 * time.Second,
		maxReconnectBackoff:  30 * time.Second,
		reconnectMultiplier:  1.5,
		outgoing:             make(chan *worker_pb.WorkerMessage, 100),
		incoming:             make(chan *worker_pb.AdminMessage, 100),
		responseChans:        make(map[string]chan *worker_pb.AdminMessage),
		cmds:                 make(chan grpcCommand),
	}
	go c.managerLoop()
	return c
}

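// Example construction (a sketch; the address, worker ID, and plaintext
// credentials below are placeholders, not the only valid choices):
//
//	import "google.golang.org/grpc/credentials/insecure"
//
//	client := NewGrpcAdminClient("localhost:23646", "worker-1",
//		grpc.WithTransportCredentials(insecure.NewCredentials()))
//	if err := client.Connect(); err != nil {
//		// The reconnection loop keeps retrying in the background.
//		glog.Warningf("initial connect failed: %v", err)
//	}
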
func (c *GrpcAdminClient) managerLoop() {
	state := &grpcState{shouldReconnect: true}

out:
	for cmd := range c.cmds {
		switch cmd.action {
		case ActionConnect:
			c.handleConnect(cmd, state)
		case ActionDisconnect:
			c.handleDisconnect(cmd, state)
			break out
		case ActionReconnect:
			if state.connected || state.reconnecting || !state.shouldReconnect {
				cmd.resp <- ErrAlreadyConnected
				continue
			}
			state.reconnecting = true // Manager acknowledges the attempt
			err := c.reconnect(state)
			state.reconnecting = false
			cmd.resp <- err
		case ActionStreamError:
			state.connected = false
		case ActionRegisterWorker:
			req := cmd.data.(registrationRequest)
			state.lastWorkerInfo = req.Worker
			if !state.connected {
				glog.V(1).Infof("Not connected yet, worker info stored for registration upon connection")
				// Respond immediately with success (registration will happen later)
				req.Resp <- nil
				continue
			}
			err := c.sendRegistration(req.Worker)
			req.Resp <- err
		case ActionQueryConnected:
			respCh := cmd.data.(chan bool)
			respCh <- state.connected
		case ActionQueryReconnecting:
			respCh := cmd.data.(chan bool)
			respCh <- state.reconnecting
		case ActionQueryShouldReconnect:
			respCh := cmd.data.(chan bool)
			respCh <- state.shouldReconnect
		}
	}
}

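// Note on shutdown: ActionDisconnect breaks the labeled loop above, so the
// manager goroutine exits by itself once handleDisconnect has replied. Any
// command sent after that would block forever, so Disconnect must be the
// final command issued to this client.
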
// Connect establishes gRPC connection to admin server with TLS detection
func (c *GrpcAdminClient) Connect() error {
	resp := make(chan error)
	c.cmds <- grpcCommand{
		action: ActionConnect,
		resp:   resp,
	}
	return <-resp
}

func (c *GrpcAdminClient) handleConnect(cmd grpcCommand, s *grpcState) {
	if s.connected {
		cmd.resp <- ErrAlreadyConnected
		return
	}

	// Start reconnection loop immediately (async)
	stop := make(chan struct{})
	s.reconnectStop = stop
	go c.reconnectionLoop(stop)

	// Attempt the initial connection
	if err := c.attemptConnection(s); err != nil {
		glog.V(1).Infof("Initial connection failed, reconnection loop will retry: %v", err)
		cmd.resp <- err
		return
	}
	cmd.resp <- nil
}

// createConnection attempts to connect using the provided dial option
func (c *GrpcAdminClient) createConnection() (*grpc.ClientConn, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	conn, err := pb.GrpcDial(ctx, c.adminAddress, false, c.dialOption)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to admin server: %w", err)
	}

	glog.Infof("Connected to admin server at %s", c.adminAddress)
	return conn, nil
}

// attemptConnection tries to establish the connection without managing the reconnection loop
func (c *GrpcAdminClient) attemptConnection(s *grpcState) error {
	// Detect TLS support and create appropriate connection
	conn, err := c.createConnection()
	if err != nil {
		return fmt.Errorf("failed to connect to admin server: %w", err)
	}

	s.conn = conn
	s.client = worker_pb.NewWorkerServiceClient(conn)

	// Create bidirectional stream
	s.streamCtx, s.streamCancel = context.WithCancel(context.Background())
	stream, err := s.client.WorkerStream(s.streamCtx)
	if err != nil {
		s.streamCancel()
		s.conn.Close()
		return fmt.Errorf("failed to create worker stream: %w", err)
	}
	glog.Infof("Worker stream created")
	s.connected = true
	s.stream = stream

	// Always check for worker info and send registration immediately as the very first message
	if s.lastWorkerInfo != nil {
		// Send registration synchronously as the very first message
		if err := c.sendRegistrationSync(s.lastWorkerInfo, s.stream); err != nil {
			s.streamCancel()
			s.conn.Close()
			s.connected = false
			return fmt.Errorf("failed to register worker: %w", err)
		}
		glog.Infof("Worker registered successfully with admin server")
	} else {
		// No worker info yet - stream will wait for registration
		glog.V(1).Infof("Connected to admin server, waiting for worker registration info")
	}

	// Start stream handlers
	s.streamExit = make(chan struct{})
	go handleOutgoing(s.stream, s.streamExit, c.outgoing, c.cmds)
	go handleIncoming(c.workerID, s.stream, s.streamExit, c.incoming, c.cmds)

	glog.Infof("Connected to admin server at %s", c.adminAddress)
	return nil
}

// reconnect attempts to re-establish the connection
func (c *GrpcAdminClient) reconnect(s *grpcState) error {
	// Clean up existing connection completely
	if s.streamCancel != nil {
		s.streamCancel()
	}
	if s.conn != nil {
		s.conn.Close()
	}
	s.connected = false

	// Attempt to re-establish connection using the same logic as initial connection
	if err := c.attemptConnection(s); err != nil {
		return fmt.Errorf("failed to reconnect: %w", err)
	}

	// Registration is now handled in attemptConnection if worker info is available
	return nil
}

// reconnectionLoop handles automatic reconnection with exponential backoff
func (c *GrpcAdminClient) reconnectionLoop(reconnectStop chan struct{}) {
	backoff := c.reconnectBackoff
	attempts := 0

	for {
		waitDuration := backoff
		if attempts == 0 {
			waitDuration = time.Second
		}
		select {
		case <-reconnectStop:
			return
		case <-time.After(waitDuration):
		}
		resp := make(chan error, 1)
		c.cmds <- grpcCommand{
			action: ActionReconnect,
			resp:   resp,
		}
		err := <-resp
		if err == nil {
			// Successful reconnection
			attempts = 0
			backoff = c.reconnectBackoff
			glog.Infof("Successfully reconnected to admin server")
		} else if errors.Is(err, ErrAlreadyConnected) {
			attempts = 0
			backoff = c.reconnectBackoff
		} else {
			attempts++
			glog.Errorf("Reconnection attempt %d failed: %v", attempts, err)

			// Check if we should give up
			if c.maxReconnectAttempts > 0 && attempts >= c.maxReconnectAttempts {
				glog.Errorf("Max reconnection attempts (%d) reached, giving up", c.maxReconnectAttempts)
				return
			}

			// Increase backoff
			backoff = time.Duration(float64(backoff) * c.reconnectMultiplier)
			if backoff > c.maxReconnectBackoff {
				backoff = c.maxReconnectBackoff
			}
			glog.Infof("Waiting %v before next reconnection attempt", backoff)
		}
	}
}

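// With the defaults set in NewGrpcAdminClient (1s initial backoff, 1.5x
// multiplier, 30s cap, unlimited attempts), consecutive failures wait
// roughly 1s, 1.5s, 2.25s, 3.4s, 5.1s, ... until capped at 30s; the backoff
// resets to 1s after any successful (or already-connected) attempt.
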
// handleOutgoing processes outgoing messages to admin
func handleOutgoing(
	stream worker_pb.WorkerService_WorkerStreamClient,
	streamExit <-chan struct{},
	outgoing <-chan *worker_pb.WorkerMessage,
	cmds chan<- grpcCommand) {

	msgCh := make(chan *worker_pb.WorkerMessage)
	errCh := make(chan error, 1) // Buffered to prevent blocking if the manager is busy

	// Goroutine to handle blocking stream.Send() so the loop below can
	// simultaneously handle exit signals
	go func() {
		for msg := range msgCh {
			if err := stream.Send(msg); err != nil {
				errCh <- err
				return // Exit the sender goroutine on error/EOF
			}
		}
		close(errCh)
	}()

	for msg := range outgoing {
		select {
		case msgCh <- msg:
		case err := <-errCh:
			glog.Errorf("Failed to send message to admin: %v", err)
			cmds <- grpcCommand{action: ActionStreamError, data: err}
			return
		case <-streamExit:
			close(msgCh)
			<-errCh
			return
		}
	}
}

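// Only the goroutine above ever calls stream.Send: gRPC streams allow at most
// one concurrent sender, so funneling all writes through msgCh is what makes
// shutdown race-free. Teardown happens via the stream context (streamCancel)
// rather than CloseSend, because CloseSend is not safe to call while another
// goroutine may still be calling Send.
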
// handleIncoming processes incoming messages from admin
func handleIncoming(
	workerID string,
	stream worker_pb.WorkerService_WorkerStreamClient,
	streamExit <-chan struct{},
	incoming chan<- *worker_pb.AdminMessage,
	cmds chan<- grpcCommand) {

	glog.V(1).Infof("INCOMING HANDLER STARTED: Worker %s incoming message handler started", workerID)
	msgCh := make(chan *worker_pb.AdminMessage)
	errCh := make(chan error, 1) // Buffered to prevent blocking if the manager is busy

	// Goroutine to handle blocking stream.Recv() so the loop below can
	// simultaneously handle exit signals
	go func() {
		for {
			msg, err := stream.Recv()
			if err != nil {
				errCh <- err
				return // Exit the receiver goroutine on error/EOF
			}
			msgCh <- msg
		}
	}()

	for {
		glog.V(4).Infof("LISTENING: Worker %s waiting for message from admin server", workerID)

		select {
		case msg := <-msgCh:
			// Message successfully received from the stream
			glog.V(4).Infof("MESSAGE RECEIVED: Worker %s received message from admin server: %T", workerID, msg.Message)

			// Route message to waiting goroutines or general handler
			select {
			case incoming <- msg:
				glog.V(3).Infof("MESSAGE ROUTED: Worker %s successfully routed message to handler", workerID)
			case <-time.After(time.Second):
				glog.Warningf("MESSAGE DROPPED: Worker %s incoming message buffer full, dropping message: %T", workerID, msg.Message)
			}

		case err := <-errCh:
			// Stream receiver goroutine reported an error (EOF or network error)
			if err == io.EOF {
				glog.Infof("STREAM CLOSED: Worker %s admin server closed the stream", workerID)
			} else {
				glog.Errorf("RECEIVE ERROR: Worker %s failed to receive message from admin: %v", workerID, err)
			}

			// Report the failure as a command to the managerLoop (blocking)
			cmds <- grpcCommand{action: ActionStreamError, data: err}

			// Exit the main handler loop
			glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler due to stream error", workerID)
			return

		case <-streamExit:
			// Manager closed this channel, signaling a controlled disconnection.
			glog.V(1).Infof("INCOMING HANDLER STOPPED: Worker %s stopping incoming handler - received exit signal", workerID)
			return
		}
	}
}

// Disconnect gracefully disconnects from the admin server
func (c *GrpcAdminClient) Disconnect() error {
	resp := make(chan error)
	c.cmds <- grpcCommand{
		action: ActionDisconnect,
		resp:   resp,
	}
	return <-resp
}

func (c *GrpcAdminClient) handleDisconnect(cmd grpcCommand, s *grpcState) {
	if !s.connected {
		cmd.resp <- fmt.Errorf("already disconnected")
		return
	}

	// Send shutdown signal to stop reconnection loop
	close(s.reconnectStop)

	s.connected = false
	s.shouldReconnect = false

	// Send shutdown message
	shutdownMsg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Shutdown{
			Shutdown: &worker_pb.WorkerShutdown{
				WorkerId: c.workerID,
				Reason:   "normal shutdown",
			},
		},
	}

	select {
	case c.outgoing <- shutdownMsg:
	case <-time.After(time.Second):
		glog.Warningf("Failed to send shutdown message")
	}

	// Send exit signal to stop the stream handlers, AFTER the shutdown
	// message has been enqueued
	close(s.streamExit)

	// Cancel stream context
	if s.streamCancel != nil {
		s.streamCancel()
	}

	// Close connection
	if s.conn != nil {
		s.conn.Close()
	}

	// Close channels
	close(c.outgoing)
	close(c.incoming)

	glog.Infof("Disconnected from admin server")
	cmd.resp <- nil
}

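// The ordering above is deliberate: the shutdown message is enqueued on
// c.outgoing before streamExit is closed, giving handleOutgoing a chance to
// deliver it; only afterwards are the stream context canceled, the connection
// closed, and the channels closed.
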
// RegisterWorker registers the worker with the admin server
func (c *GrpcAdminClient) RegisterWorker(worker *types.WorkerData) error {
	respCh := make(chan error, 1)
	request := registrationRequest{
		Worker: worker,
		Resp:   respCh,
	}
	c.cmds <- grpcCommand{
		action: ActionRegisterWorker,
		data:   request,
	}
	return <-respCh
}

// sendRegistration sends the registration message and waits for response
func (c *GrpcAdminClient) sendRegistration(worker *types.WorkerData) error {
	capabilities := make([]string, len(worker.Capabilities))
	for i, cap := range worker.Capabilities {
		capabilities[i] = string(cap)
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Registration{
			Registration: &worker_pb.WorkerRegistration{
				WorkerId:      c.workerID,
				Address:       worker.Address,
				Capabilities:  capabilities,
				MaxConcurrent: int32(worker.MaxConcurrent),
				Metadata:      make(map[string]string),
			},
		},
	}

	select {
	case c.outgoing <- msg:
	case <-time.After(5 * time.Second):
		return fmt.Errorf("failed to send registration message: timeout")
	}

	// Wait for registration response
	timeout := time.NewTimer(10 * time.Second)
	defer timeout.Stop()

	for {
		select {
		case response := <-c.incoming:
			if regResp := response.GetRegistrationResponse(); regResp != nil {
				if regResp.Success {
					glog.Infof("Worker registered successfully: %s", regResp.Message)
					return nil
				}
				return fmt.Errorf("registration failed: %s", regResp.Message)
			}
		case <-timeout.C:
			return fmt.Errorf("registration timeout")
		}
	}
}

// sendRegistrationSync sends the registration message synchronously
func (c *GrpcAdminClient) sendRegistrationSync(worker *types.WorkerData, stream worker_pb.WorkerService_WorkerStreamClient) error {
	capabilities := make([]string, len(worker.Capabilities))
	for i, cap := range worker.Capabilities {
		capabilities[i] = string(cap)
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Registration{
			Registration: &worker_pb.WorkerRegistration{
				WorkerId:      c.workerID,
				Address:       worker.Address,
				Capabilities:  capabilities,
				MaxConcurrent: int32(worker.MaxConcurrent),
				Metadata:      make(map[string]string),
			},
		},
	}

	// Send directly to stream to ensure it's the first message
	if err := stream.Send(msg); err != nil {
		return fmt.Errorf("failed to send registration message: %w", err)
	}

	// Create channels to receive the response
	responseChan := make(chan *worker_pb.AdminMessage, 1)
	errChan := make(chan error, 1)

	// Start a goroutine to listen for the response
	go func() {
		for {
			response, err := stream.Recv()
			if err != nil {
				errChan <- fmt.Errorf("failed to receive registration response: %w", err)
				return
			}

			if regResp := response.GetRegistrationResponse(); regResp != nil {
				responseChan <- response
				return
			}
			// Continue waiting if it's not a registration response.
			// If the stream is stuck, reconnect() will kill it, cleaning up
			// this goroutine.
		}
	}()

	// Wait for registration response with timeout
	timeout := time.NewTimer(10 * time.Second)
	defer timeout.Stop()

	select {
	case response := <-responseChan:
		if regResp := response.GetRegistrationResponse(); regResp != nil {
			if regResp.Success {
				glog.V(1).Infof("Worker registered successfully: %s", regResp.Message)
				return nil
			}
			return fmt.Errorf("registration failed: %s", regResp.Message)
		}
		return fmt.Errorf("unexpected response type")
	case err := <-errChan:
		return err
	case <-timeout.C:
		return fmt.Errorf("registration timeout")
	}
}

func (c *GrpcAdminClient) IsConnected() bool {
	respCh := make(chan bool, 1)

	c.cmds <- grpcCommand{
		action: ActionQueryConnected,
		data:   respCh,
	}

	return <-respCh
}

func (c *GrpcAdminClient) IsReconnecting() bool {
	respCh := make(chan bool, 1)

	c.cmds <- grpcCommand{
		action: ActionQueryReconnecting,
		data:   respCh,
	}

	return <-respCh
}

func (c *GrpcAdminClient) ShouldReconnect() bool {
	respCh := make(chan bool, 1)

	c.cmds <- grpcCommand{
		action: ActionQueryShouldReconnect,
		data:   respCh,
	}

	return <-respCh
}

// SendHeartbeat sends heartbeat to admin server
func (c *GrpcAdminClient) SendHeartbeat(workerID string, status *types.WorkerStatus) error {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just skip the heartbeat
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("Skipping heartbeat during reconnection")
			return nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(10 * time.Second); err != nil {
			return fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	taskIds := make([]string, len(status.CurrentTasks))
	for i, task := range status.CurrentTasks {
		taskIds[i] = task.ID
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_Heartbeat{
			Heartbeat: &worker_pb.WorkerHeartbeat{
				WorkerId:       c.workerID,
				Status:         status.Status,
				CurrentLoad:    int32(status.CurrentLoad),
				MaxConcurrent:  int32(status.MaxConcurrent),
				CurrentTaskIds: taskIds,
				TasksCompleted: int32(status.TasksCompleted),
				TasksFailed:    int32(status.TasksFailed),
				UptimeSeconds:  int64(status.Uptime.Seconds()),
			},
		},
	}

	select {
	case c.outgoing <- msg:
		return nil
	case <-time.After(time.Second):
		return fmt.Errorf("failed to send heartbeat: timeout")
	}
}

// RequestTask requests a new task from admin server
func (c *GrpcAdminClient) RequestTask(workerID string, capabilities []types.TaskType) (*types.TaskInput, error) {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just return no task
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("RECONNECTING: Worker %s skipping task request during reconnection", workerID)
			return nil, nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(5 * time.Second); err != nil {
			return nil, fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	caps := make([]string, len(capabilities))
	for i, cap := range capabilities {
		caps[i] = string(cap)
	}

	glog.V(3).Infof("📤 SENDING TASK REQUEST: Worker %s sending task request to admin server with capabilities: %v",
		workerID, capabilities)

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_TaskRequest{
			TaskRequest: &worker_pb.TaskRequest{
				WorkerId:       c.workerID,
				Capabilities:   caps,
				AvailableSlots: 1, // Request one task
			},
		},
	}

	select {
	case c.outgoing <- msg:
		glog.V(3).Infof("TASK REQUEST SENT: Worker %s successfully sent task request to admin server", workerID)
	case <-time.After(time.Second):
		glog.Errorf("TASK REQUEST TIMEOUT: Worker %s failed to send task request: timeout", workerID)
		return nil, fmt.Errorf("failed to send task request: timeout")
	}

	// Wait for task assignment
	glog.V(3).Infof("WAITING FOR RESPONSE: Worker %s waiting for task assignment response (5s timeout)", workerID)
	timeout := time.NewTimer(5 * time.Second)
	defer timeout.Stop()

	for {
		select {
		case response := <-c.incoming:
			glog.V(3).Infof("RESPONSE RECEIVED: Worker %s received response from admin server: %T", workerID, response.Message)
			if taskAssign := response.GetTaskAssignment(); taskAssign != nil {
				glog.V(1).Infof("Worker %s received task assignment in response: %s (type: %s, volume: %d)",
					workerID, taskAssign.TaskId, taskAssign.TaskType, taskAssign.Params.VolumeId)

				// Convert to our task type
				task := &types.TaskInput{
					ID:         taskAssign.TaskId,
					Type:       types.TaskType(taskAssign.TaskType),
					Status:     types.TaskStatusAssigned,
					VolumeID:   taskAssign.Params.VolumeId,
					Server:     getServerFromParams(taskAssign.Params),
					Collection: taskAssign.Params.Collection,
					Priority:   types.TaskPriority(taskAssign.Priority),
					CreatedAt:  time.Unix(taskAssign.CreatedTime, 0),
					// Use typed protobuf parameters directly
					TypedParams: taskAssign.Params,
				}
				return task, nil
			}
			glog.V(3).Infof("NON-TASK RESPONSE: Worker %s received non-task response: %T", workerID, response.Message)
		case <-timeout.C:
			glog.V(3).Infof("TASK REQUEST TIMEOUT: Worker %s - no task assignment received within 5 seconds", workerID)
			return nil, nil // No task available
		}
	}
}

// CompleteTask reports task completion to admin server
func (c *GrpcAdminClient) CompleteTask(taskID string, success bool, errorMsg string) error {
	return c.CompleteTaskWithMetadata(taskID, success, errorMsg, nil)
}

// CompleteTaskWithMetadata reports task completion with additional metadata
func (c *GrpcAdminClient) CompleteTaskWithMetadata(taskID string, success bool, errorMsg string, metadata map[string]string) error {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just skip the completion report
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("Skipping task completion report during reconnection for task %s", taskID)
			return nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(5 * time.Second); err != nil {
			return fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	taskComplete := &worker_pb.TaskComplete{
		TaskId:         taskID,
		WorkerId:       c.workerID,
		Success:        success,
		ErrorMessage:   errorMsg,
		CompletionTime: time.Now().Unix(),
	}

	// Add metadata if provided
	if metadata != nil {
		taskComplete.ResultMetadata = metadata
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_TaskComplete{
			TaskComplete: taskComplete,
		},
	}

	select {
	case c.outgoing <- msg:
		return nil
	case <-time.After(time.Second):
		return fmt.Errorf("failed to send task completion: timeout")
	}
}

// UpdateTaskProgress updates task progress to admin server
func (c *GrpcAdminClient) UpdateTaskProgress(taskID string, progress float64) error {
	if !c.IsConnected() {
		// If we're currently reconnecting, don't wait - just skip the progress update
		if c.IsReconnecting() {
			// Don't treat as an error - reconnection is in progress
			glog.V(2).Infof("Skipping task progress update during reconnection for task %s", taskID)
			return nil
		}

		// Wait for reconnection for a short time
		if err := c.waitForConnection(5 * time.Second); err != nil {
			return fmt.Errorf("not connected to admin server: %w", err)
		}
	}

	msg := &worker_pb.WorkerMessage{
		WorkerId:  c.workerID,
		Timestamp: time.Now().Unix(),
		Message: &worker_pb.WorkerMessage_TaskUpdate{
			TaskUpdate: &worker_pb.TaskUpdate{
				TaskId:   taskID,
				WorkerId: c.workerID,
				Status:   "in_progress",
				Progress: float32(progress),
			},
		},
	}

	select {
	case c.outgoing <- msg:
		return nil
	case <-time.After(time.Second):
		return fmt.Errorf("failed to send task progress: timeout")
	}
}

// waitForConnection waits for the connection to be established or timeout
func (c *GrpcAdminClient) waitForConnection(timeout time.Duration) error {
	deadline := time.Now().Add(timeout)

	for time.Now().Before(deadline) {
		if c.IsConnected() {
			return nil
		}

		if !c.ShouldReconnect() {
			return fmt.Errorf("reconnection is disabled")
		}

		time.Sleep(100 * time.Millisecond)
	}

	return fmt.Errorf("timeout waiting for connection")
}

// GetIncomingChannel returns the incoming message channel for message processing
// This allows the worker to process admin messages directly
func (c *GrpcAdminClient) GetIncomingChannel() <-chan *worker_pb.AdminMessage {
	return c.incoming
}

// CreateAdminClient creates an admin client with the provided dial option
func CreateAdminClient(adminServer string, workerID string, dialOption grpc.DialOption) (AdminClient, error) {
	return NewGrpcAdminClient(adminServer, workerID, dialOption), nil
}

// getServerFromParams extracts server address from unified sources
func getServerFromParams(params *worker_pb.TaskParams) string {
	if len(params.Sources) > 0 {
		return params.Sources[0].Node
	}
	return ""
}